brennen
/
bpb-kit

#!/home/brennen/python_bullshit/bin/python3
"""    This script collects metadata from VimWiki pages in my notes."""
import os
import re
import sqlite3
import sys
from pprint import pprint

from panflute import *

def resolve_target(target, page):
    """Resolve the target of a link to a canonical name.

    Args:
        target: The link URL found on the page.
        page: Name of the page containing the link, as a ``/``-separated
            path. Its last component (the page name itself) is dropped to
            obtain the directory that relative targets are resolved against.

    Returns:
        ``target`` unchanged when it looks like an external URL or a diary
        entry; otherwise the wiki page name with any trailing ``.html``
        extension removed, a leading ``/`` stripped from absolute links, and
        relative links resolved against the directory of ``page``.
    """
    # External resources are passed through untouched:
    if re.search(r'https?://|file:|tel:|mailto:', target):
        return target

    # Special case diary entries because Pandoc returns them as relative
    # links.  This will break if you're trying to link to
    # foo/diary/2023-01-01 instead of /diary/2023-01-01, but at least within
    # my notes that's not something I do, and it's probably a rare case in
    # general.
    if re.search(r'diary/\d{4}-\d{2}-\d{2}', target):
        return target

    # At this point, we should be fairly confident the link is a wiki page.
    # Normalize it by removing the .html extension that pandoc throws on
    # there:
    if target.endswith('.html'):
        target = os.path.splitext(target)[0]

    # Check for an absolute path (within the wiki):
    if target.startswith('/'):
        # Note this might behave weirdly with the leading // thing, but I
        # don't _think_ Pandoc supports that anyway:
        return target[1:]

    # Directory of the linking page: everything except the page name.
    base_elements = page.split('/')[:-1]

    # Walk the combined path front to back, treating it as a stack, so that
    # every '..' removes the directory that actually precedes it.  Several
    # '..' in a row therefore climb several levels instead of cancelling
    # each other out.
    resolved_path = []
    for element in base_elements + target.split('/'):
        if element == '..' and resolved_path and resolved_path[-1] != '..':
            # Discard a directory:
            resolved_path.pop()
        else:
            # A '..' with nothing left to discard is kept verbatim.
            resolved_path.append(element)

    return '/'.join(resolved_path)
def extract_values(elem, doc):
    """Store the link carried by ``elem``, if any, in the sqlite ``links`` table.

    Passed to ``doc.walk`` as its callback.  Relies on the module-level
    ``pagename`` (page currently being processed), cursor ``c`` and
    connection ``conn``.  Always returns ``None``.
    """
    # Only link elements are of interest.
    if not isinstance(elem, Link):
        return

    url = elem.url

    # Skip in-page anchors, for now:
    if url.startswith('#'):
        return

    # One row per (source page, canonical target); duplicates are ignored.
    row = (pagename, resolve_target(url, pagename))
    c.execute("INSERT OR IGNORE INTO links VALUES (?, ?)", row)
    conn.commit()
# Ensure we're in the wiki directory.  expanduser('~') uses $HOME when it is
# set and falls back to the password database otherwise, so an unset HOME
# does not blow up inside os.path.join():
notes_dir = os.path.join(os.path.expanduser('~'), 'notes')
vimwiki_dir = os.path.join(notes_dir, 'vimwiki')
os.chdir(notes_dir)

# conn, c and pagename are deliberately module-level: extract_values() reads
# them while the document is being walked.
conn = sqlite3.connect('metadata.db')
c = conn.cursor()

date_re = re.compile('diary/([0-9]{4}-[0-9]{2}-[0-9]{2})')

try:
    for input_file in sys.argv[1:]:

        # Trim leading directory and .wiki extension:
        # XXX: This fails badly if in a symlinked path
        input_file_abspath = os.path.abspath(input_file)
        pagename = input_file_abspath.replace(vimwiki_dir + os.sep, '', 1)
        pagename = os.path.splitext(pagename)[0]

        # XXX: do real logging
        # print(pagename)

        # _logscratch is for ephemeral renderings of logs for other pages -
        # skip collecting any metadata for these pages:
        if pagename.startswith('_logscratch'):
            continue

        # Read as UTF-8 explicitly rather than depending on the locale:
        with open(input_file_abspath, encoding='utf-8') as page:
            doc = convert_text(
                page.read(),
                input_format='vimwiki',
                standalone=True
            )

        # XXX: This should fall back to headers
        title = doc.get_metadata('title')
        if not title:
            title = pagename

        # This falls back to diary paths for date metadata:
        date = doc.get_metadata('date')
        if not date:
            date_match_result = date_re.match(pagename)
            if date_match_result:
                # XXX: This needs to include timezone somehow or another
                #      THIS IS A DIRTY HACK, MAKE IT USE THE ACTUAL CURRENT TZ
                date = date_match_result.group(1) + " 00:00:00.000000000-07:00"

        # Log the name and metadata of the page:
        c.execute("DELETE FROM pages WHERE page = ?", (pagename,))
        c.execute("INSERT INTO pages VALUES (?, ?, ?)", (pagename, title, date))

        # Clear any links from this page in case something's been deleted:
        c.execute("DELETE FROM links WHERE page = ?", (pagename,))

        conn.commit()

        # Re-collect the page's links; extract_values() inserts and commits
        # each one.
        doc.walk(extract_values)
finally:
    # Release the database even when a page fails to read or convert.
    conn.close()