Dotfiles, utilities, and other apparatus.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

137 lines
4.1 KiB

#!/home/brennen/python_bullshit/bin/python3
"""
This script collects metadata from VimWiki pages in my notes.
"""
import os
import re
import sys
import sqlite3
import re
from pprint import pprint
from panflute import *
def resolve_target(target, page):
    """Resolve the target of a link to a canonical name.

    Args:
        target: Link target as found in the document (a URL or a wiki path).
        page: Name of the page containing the link, as a '/'-separated path;
            relative targets are resolved against this page's directory.

    Returns:
        External URLs and diary links unchanged.  Otherwise the target with
        any '.html' extension removed: absolute targets lose their leading
        '/', relative targets are joined onto the directory of `page` with
        '..' components collapsed.  A '..' with nothing left to discard is
        kept in the result.
    """
    # External links are already canonical:
    if re.search(r'https?://|file:|tel:|mailto:', target):
        return target

    # Special case diary entries because Pandoc returns them as relative links.
    # This will break if you're trying to link to foo/diary/2023-01-01 instead
    # of /diary/2023-01-01, but at least within my notes that's not something
    # I do, and it's probably a rare case in general.
    if re.search(r'diary/\d{4}-\d{2}-\d{2}', target):
        return target

    # At this point, we should be fairly confident the link is a wiki page.
    # Normalize it by removing the .html extension that pandoc throws on there:
    if target.endswith('.html'):
        target = os.path.splitext(target)[0]

    # Check for an absolute path (within the wiki):
    if target.startswith('/'):
        # Note this might behave weirdly with the leading // thing, but I don't
        # _think_ Pandoc supports that anyway:
        return target.replace('/', '', 1)

    # Directory of the linking page, i.e. the page path minus the page name:
    page_dir = page.split('/')[:-1]

    # Walk the combined path front to back, keeping a stack of resolved
    # elements, so that consecutive '..' components each discard one
    # directory instead of cancelling each other out:
    resolved_path = []
    for element in page_dir + target.split('/'):
        if element == '..' and resolved_path and resolved_path[-1] != '..':
            # Discard a directory:
            resolved_path.pop()
        else:
            resolved_path.append(element)

    return '/'.join(resolved_path)
def extract_values(elem, doc):
    """Store the target of a link element in the links table.

    Passed to doc.walk() as the per-element action.  Elements that are not
    a Link are ignored.  Relies on the module-level `pagename`, cursor `c`
    and connection `conn` being set for the page currently being processed.
    """
    if not isinstance(elem, Link):
        return

    url = elem.url

    # In-page anchors aren't recorded, for now:
    if url.startswith('#'):
        return

    # One (page, resolved target) row per link; duplicates are ignored:
    row = (pagename, resolve_target(url, pagename))
    c.execute("INSERT OR IGNORE INTO links VALUES (?, ?)", row)
    conn.commit()
# Ensure we're in the wiki directory:
notes_dir = os.path.join(os.getenv('HOME'), 'notes')
vimwiki_dir = os.path.join(notes_dir, 'vimwiki')
os.chdir(notes_dir)

# Relative path, so this opens metadata.db inside notes_dir:
conn = sqlite3.connect('metadata.db')
c = conn.cursor()

date_re = re.compile('diary/([0-9]{4}-[0-9]{2}-[0-9]{2})')

# Close the database connection even if a page fails to read or convert:
try:
    for input_file in sys.argv[1:]:
        # Trim leading directory and .wiki extension:
        # XXX: This fails badly if in a symlinked path
        input_file_abspath = os.path.abspath(input_file)
        # pagename stays a module-level name because extract_values() reads it:
        pagename = input_file_abspath.replace(vimwiki_dir + os.sep, '', 1)
        pagename = os.path.splitext(pagename)[0]

        # XXX: do real logging
        # print(pagename)

        # _logscratch is for ephemeral renderings of logs for other pages - skip
        # collecting any metadata for these pages:
        if pagename.startswith('_logscratch'):
            continue

        # Read as UTF-8 explicitly rather than depending on the locale default:
        with open(input_file_abspath, encoding='utf-8') as page:
            doc = convert_text(
                page.read(),
                input_format='vimwiki',
                standalone=True
            )

        # XXX: This should fall back to headers
        title = doc.get_metadata('title')
        if not title:
            title = pagename

        # This falls back to diary paths for date metadata:
        date = doc.get_metadata('date')
        if not date:
            date_match_result = date_re.match(pagename)
            if date_match_result:
                # XXX: This needs to include timezone somehow or another
                # THIS IS A DIRTY HACK, MAKE IT USE THE ACTUAL CURRENT TZ
                date = date_match_result.group(1) + " 00:00:00.000000000-07:00"

        # Log the name and metadata of the page:
        c.execute("DELETE FROM pages WHERE page = ?", (pagename,))
        c.execute("INSERT INTO pages VALUES (?, ?, ?)", (pagename, title, date))

        # Clear any links from this page in case something's been deleted:
        c.execute("DELETE FROM links WHERE page = ?", (pagename,))
        conn.commit()

        # Re-collect the page's links; extract_values() inserts and commits them:
        doc.walk(extract_values)
finally:
    conn.close()