#!/home/brennen/python_bullshit/bin/python3

"""
This script collects metadata from VimWiki pages in my notes.
"""

import os
import re
import sys
import sqlite3

from pprint import pprint
from panflute import *


def resolve_target(target, page):
    """Resolve the target of a link to a canonical name."""
    if re.search('https?://|file:|tel:|mailto:', target):
        return target

    # Special case diary entries because Pandoc returns them as relative links.
    # This will break if you're trying to link to foo/diary/2023-01-01 instead
    # of /diary/2023-01-01, but at least within my notes that's not something
    # I do, and it's probably a rare case in general.
    if re.search(r'diary/\d{4}-\d{2}-\d{2}', target):
        return target

    # At this point, we should be fairly confident the link is a wiki page.
    # Normalize it by removing the .html extension that pandoc throws on there:
    if target.endswith('.html'):
        target = os.path.splitext(target)[0]

    # Check for an absolute path (within the wiki):
    if target.startswith('/'):
        # Note this might behave weirdly with the leading // thing, but I don't
        # _think_ Pandoc supports that anyway:
        return target.replace('/', '', 1)

    page_elements = page.split('/')

    # Get rid of the page name:
    page_elements.pop()

    path_elements = page_elements + target.split('/')

    resolved_path = []
    while len(path_elements) > 0:
        element = path_elements.pop()
        if element == '..' and len(path_elements) > 0:
            # Discard a directory:
            path_elements.pop()
        else:
            resolved_path.append(element)

    resolved_path.reverse()
    return '/'.join(resolved_path)


def extract_values(elem, doc):
    """Extract links from the given document, write to sqlite."""
    # pprint(elem)
    if isinstance(elem, Link):
        link_target = elem.url

        # Skip in-page anchors, for now:
        if link_target.startswith('#'):
            return

        # Insert a row of data:
        c.execute(
            "INSERT OR IGNORE INTO links VALUES (?, ?)",
            (pagename, resolve_target(link_target, pagename))
        )
        conn.commit()


# Ensure we're in the wiki directory:
notes_dir = os.path.join(os.getenv('HOME'), 'notes')
vimwiki_dir = os.path.join(notes_dir, 'vimwiki')
os.chdir(notes_dir)

conn = sqlite3.connect('metadata.db')
c = conn.cursor()

date_re = re.compile('diary/([0-9]{4}-[0-9]{2}-[0-9]{2})')

for input_file in sys.argv[1:]:
    # Trim leading directory and .wiki extension:
    # XXX: This fails badly if in a symlinked path
    input_file_abspath = os.path.abspath(input_file)
    pagename = input_file_abspath.replace(vimwiki_dir + os.sep, '', 1)
    pagename = os.path.splitext(pagename)[0]

    # XXX: do real logging
    # print(pagename)

    # _logscratch is for ephemeral renderings of logs for other pages - skip
    # collecting any metadata for these pages:
    if pagename.startswith('_logscratch'):
        continue

    with open(input_file_abspath) as page:
        doc = convert_text(
            page.read(),
            input_format='vimwiki',
            standalone=True
        )

    # XXX: This should fall back to headers
    title = doc.get_metadata('title')
    if not title:
        title = pagename

    # This falls back to diary paths for date metadata:
    date = doc.get_metadata('date')
    if not date:
        date_match_result = date_re.match(pagename)
        if date_match_result:
            # XXX: This needs to include timezone somehow or another
            # THIS IS A DIRTY HACK, MAKE IT USE THE ACTUAL CURRENT TZ
            date = date_match_result.group(1) + " 00:00:00.000000000-07:00"

    # Log the name and metadata of the page:
    c.execute("DELETE FROM pages WHERE page = ?", (pagename,))
    c.execute("INSERT INTO pages VALUES (?, ?, ?)", (pagename, title, date))

    # Clear any links from this page in case something's been deleted:
    c.execute("DELETE FROM links WHERE page = ?", (pagename,))
    conn.commit()
    doc.walk(extract_values)

conn.close()
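
# A rough sketch of the sqlite schema this script assumes already exists in
# metadata.db -- the script itself never creates these tables. The "page"
# column name comes from the DELETE statements above; the remaining column
# names and constraints are assumptions inferred from the INSERT statements,
# not confirmed:
#
#     conn.executescript("""
#         CREATE TABLE IF NOT EXISTS pages (page TEXT PRIMARY KEY, title TEXT, date TEXT);
#         CREATE TABLE IF NOT EXISTS links (page TEXT, target TEXT, UNIQUE(page, target));
#     """)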