brennen
/
bpb-kit


								#!/home/brennen/python_bullshit/bin/python3


								"""

								    This script collects metadata from VimWiki pages in my notes.

								"""


								import os

								import re

								import sys

								import sqlite3

								from pprint import pprint

								from panflute import *


								def resolve_target(target, page):
								    """Resolve the target of a link to a canonical name.

								    Args:
								        target: Link URL as found in the document.
								        page: '/'-separated name of the page containing the link.  Its
								            final component is dropped to obtain the directory that
								            relative targets are resolved against.

								    Returns:
								        The target unchanged if it contains an external scheme (http,
								        https, file, tel, mailto) or looks like a diary entry.  Otherwise
								        a wiki-relative page name with a trailing '.html' removed, a
								        leading '/' stripped, and '.' / '..' segments collapsed.  '..'
								        segments that would climb above the top of the wiki are kept.
								    """
								    # NOTE(review): this search is unanchored, so a scheme-like string
								    # anywhere in the target (not only at the start) counts as external.
								    if re.search('https?://|file:|tel:|mailto:', target):
								        return target

								    # Special case diary entries because Pandoc returns them as relative
								    # links.  This will break for a link to foo/diary/2023-01-01 instead
								    # of /diary/2023-01-01, which is expected to be rare.
								    # NOTE(review): this returns before the '.html' suffix is stripped
								    # below - confirm that is intended for diary targets.
								    if re.search(r'diary/\d{4}-\d{2}-\d{2}', target):
								        return target

								    # At this point, we should be fairly confident the link is a wiki
								    # page.  Normalize it by removing the .html extension that pandoc
								    # throws on there:
								    if target.endswith('.html'):
								        target = os.path.splitext(target)[0]

								    # Absolute path (within the wiki): drop the single leading slash.
								    # Note this might behave weirdly with a leading '//'.
								    if target.startswith('/'):
								        return target[1:]

								    # Directory of the linking page: everything but the page name.
								    base_elements = page.split('/')[:-1]

								    # Walk the combined path front to back, keeping a stack of resolved
								    # segments, so that each '..' removes the directory that actually
								    # precedes it (runs of '..' climb one level per segment).
								    resolved_path = []
								    for element in base_elements + target.split('/'):
								        if element == '.':
								            # Current directory: contributes nothing.
								            continue
								        if element == '..' and resolved_path and resolved_path[-1] != '..':
								            # Discard a directory:
								            resolved_path.pop()
								        else:
								            # Ordinary segment, or a '..' with nothing left to climb out
								            # of, which is preserved verbatim.
								            resolved_path.append(element)

								    return '/'.join(resolved_path)


								def extract_values(elem, doc):
								    """Walk callback: store the link carried by a Link element in sqlite.

								    Elements that are not Link instances, and links whose URL starts
								    with '#', are ignored.  For every other link a (page, target) row
								    is inserted into the links table using the module-level cursor `c`
								    and the current `pagename`, and the connection is committed.
								    """
								    if not isinstance(elem, Link):
								        return

								    url = elem.url

								    # In-page anchors are skipped, for now.
								    if url.startswith('#'):
								        return

								    # One row per link: source page and resolved target.
								    row = (pagename, resolve_target(url, pagename))
								    c.execute("INSERT OR IGNORE INTO links VALUES (?, ?)", row)

								    conn.commit()


								# Work from the notes directory, so the database file below is opened
								# relative to it:
								notes_dir = os.path.join(os.getenv('HOME'), 'notes')
								vimwiki_dir = os.path.join(notes_dir, 'vimwiki')
								os.chdir(notes_dir)

								conn = sqlite3.connect('metadata.db')
								c = conn.cursor()

								# Captures the YYYY-MM-DD part of a diary page name.
								date_re = re.compile('diary/([0-9]{4}-[0-9]{2}-[0-9]{2})')

								for input_file in sys.argv[1:]:

								    # Page name is the absolute path with the leading vimwiki directory
								    # and the file extension trimmed off.
								    # XXX: This fails badly if in a symlinked path
								    input_file_abspath = os.path.abspath(input_file)
								    pagename = os.path.splitext(
								        input_file_abspath.replace(vimwiki_dir + os.sep, '', 1)
								    )[0]

								    # XXX: do real logging

								    # _logscratch holds ephemeral renderings of logs for other pages;
								    # no metadata is collected for those.
								    if pagename.startswith('_logscratch'):
								        continue

								    with open(input_file_abspath) as page:
								        doc = convert_text(page.read(), input_format='vimwiki', standalone=True)

								    # Title metadata, falling back to the page name.
								    # XXX: This should fall back to headers
								    title = doc.get_metadata('title') or pagename

								    # Date metadata, falling back to the date in a diary page name:
								    date = doc.get_metadata('date')
								    if not date:
								        date_match_result = date_re.match(pagename)
								        if date_match_result is not None:
								            # XXX: This needs to include timezone somehow or another
								            #      THIS IS A DIRTY HACK, MAKE IT USE THE ACTUAL CURRENT TZ
								            date = date_match_result.group(1) + " 00:00:00.000000000-07:00"

								    # Replace the stored row for this page with the fresh metadata:
								    c.execute("DELETE FROM pages WHERE page = ?", (pagename,))
								    c.execute("INSERT INTO pages VALUES (?, ?, ?)", (pagename, title, date))

								    # Drop previously recorded links from this page, in case some have
								    # been deleted since the last run:
								    c.execute("DELETE FROM links WHERE page = ?", (pagename,))

								    conn.commit()

								    # Record the page's current links.
								    doc.walk(extract_values)

								conn.close()