Dotfiles, utilities, and other apparatus.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

137 lines
4.1 KiB

#!/usr/bin/env python3
"""
This script collects metadata from VimWiki pages in my notes.
"""
import os
import re
import sys
import sqlite3
import re
from pprint import pprint
from panflute import *
def resolve_target(target, page):
    """Resolve the target of a link to a canonical name.

    Args:
        target: Link destination as reported by Pandoc: an external URL, a
            diary path, an absolute wiki path (leading '/'), or a path
            relative to the linking page.
        page: Name of the page containing the link, as a '/'-separated path
            whose last element is the page itself (e.g. 'projects/foo').

    Returns:
        ``target`` unchanged when it looks like an external URL or a diary
        entry.  Otherwise the wiki page name with a trailing '.html'
        removed; absolute targets lose their leading '/', and relative
        targets are resolved against the directory of ``page`` with '.'
        and '..' segments collapsed.  A '..' that would climb above the
        wiki root is kept in the result rather than silently dropped.
    """
    if re.search(r'https?://|file:|tel:|mailto:', target):
        return target

    # Special case diary entries because Pandoc returns them as relative links.
    # This will break if you're trying to link to foo/diary/2023-01-01 instead
    # of /diary/2023-01-01, but at least within my notes that's not something
    # I do, and it's probably a rare case in general.
    if re.search(r'diary/\d{4}-\d{2}-\d{2}', target):
        return target

    # At this point, we should be fairly confident the link is a wiki page.
    # Normalize it by removing the .html extension that pandoc throws on there:
    if target.endswith('.html'):
        target = os.path.splitext(target)[0]

    # Check for an absolute path (within the wiki):
    if target.startswith('/'):
        # Note this might behave weirdly with the leading // thing, but I don't
        # _think_ Pandoc supports that anyway:
        return target.replace('/', '', 1)

    # Directory of the linking page: everything except the page name itself.
    base_elements = page.split('/')[:-1]

    # Walk left to right, using resolved_path as a stack.  Each '..' removes
    # the most recent real directory; walking in this direction means that
    # consecutive '..' segments each climb one level instead of cancelling
    # each other out.
    resolved_path = []
    for element in base_elements + target.split('/'):
        if element == '.':
            # "Current directory" adds nothing to a canonical name.
            continue
        if element == '..' and resolved_path and resolved_path[-1] != '..':
            resolved_path.pop()
        else:
            resolved_path.append(element)

    return '/'.join(resolved_path)
def extract_values(elem, doc):
    """Record one outgoing link of the current page in the links table.

    Intended as the action passed to ``doc.walk()``: it is called with each
    element of the document and always returns None.  Only ``Link``
    elements are acted on; ``doc`` is accepted but not used.

    Relies on the module-level ``pagename`` (page being processed), cursor
    ``c`` and connection ``conn``.
    """
    if not isinstance(elem, Link):
        return

    url = elem.url

    # Skip in-page anchors, for now:
    if url.startswith('#'):
        return

    # One (source page, canonical target) row per link; duplicates are left
    # to INSERT OR IGNORE.
    row = (pagename, resolve_target(url, pagename))
    c.execute("INSERT OR IGNORE INTO links VALUES (?, ?)", row)
    conn.commit()
# Ensure we're in the wiki directory.  Prefer $HOME as before, but fall back
# to expanduser() so an unset HOME does not pass None to os.path.join():
home_dir = os.getenv('HOME') or os.path.expanduser('~')
notes_dir = os.path.join(home_dir, 'notes')
vimwiki_dir = os.path.join(notes_dir, 'vimwiki')
os.chdir(notes_dir)

# metadata.db is opened relative to notes_dir because of the chdir above.
# conn, c and pagename are module-level on purpose: extract_values() reads
# them while walking each document.
conn = sqlite3.connect('metadata.db')
c = conn.cursor()

date_re = re.compile('diary/([0-9]{4}-[0-9]{2}-[0-9]{2})')

try:
    for input_file in sys.argv[1:]:
        # Trim leading directory and .wiki extension:
        # XXX: This fails badly if in a symlinked path
        input_file_abspath = os.path.abspath(input_file)
        pagename = input_file_abspath.replace(vimwiki_dir + os.sep, '', 1)
        pagename = os.path.splitext(pagename)[0]

        # XXX: do real logging

        # _logscratch is for ephemeral renderings of logs for other pages -
        # skip collecting any metadata for these pages:
        if pagename.startswith('_logscratch'):
            continue

        # Decode explicitly as UTF-8 instead of depending on whatever the
        # locale's default encoding happens to be.
        with open(input_file_abspath, encoding='utf-8') as page:
            doc = convert_text(
                page.read(),
                input_format='vimwiki',
                standalone=True
            )

        # XXX: This should fall back to headers
        title = doc.get_metadata('title')
        if not title:
            title = pagename

        # This falls back to diary paths for date metadata:
        date = doc.get_metadata('date')
        if not date:
            date_match_result = date_re.match(pagename)
            if date_match_result:
                # XXX: This needs to include timezone somehow or another
                # THIS IS A DIRTY HACK, MAKE IT USE THE ACTUAL CURRENT TZ
                date = date_match_result.group(1) + " 00:00:00.000000000-07:00"

        # Log the name and metadata of the page:
        c.execute("DELETE FROM pages WHERE page = ?", (pagename,))
        c.execute("INSERT INTO pages VALUES (?, ?, ?)", (pagename, title, date))

        # Clear any links from this page in case something's been deleted:
        c.execute("DELETE FROM links WHERE page = ?", (pagename,))
        conn.commit()

        # Re-populate the links table for this page, one row per Link.
        doc.walk(extract_values)
finally:
    # Release the database handle even when a page fails to read or convert.
    conn.close()