#!/home/brennen/python_bullshit/bin/python3
"""
This script collects metadata from VimWiki pages in my notes.
"""
import os
import re
import sys
import sqlite3
import re
from pprint import pprint
from panflute import *
def resolve_target(target, page):
|
|
"""Resolve the target of a link to a canonical name."""
|
|
|
|
if re.search('https?://|file:|tel:|mailto:', target):
|
|
return target
|
|
|
|
# Special case diary entries because Pandoc returns them as relative links.
|
|
# This will break if you're trying to link to foo/diary/2023-01-01 instead
|
|
# of /diary/2023-01-01, but at least within my notes that's not something
|
|
# I do, and it's probably a rare case in general.
|
|
if re.search('diary/\d{4}-\d{2}-\d{2}', target):
|
|
return target
|
|
|
|
# At this point, we should be fairly confident the link is a wiki page.
|
|
# Normalize it by removing the .html extension that pandoc throws on there:
|
|
if target.endswith('.html'):
|
|
target = os.path.splitext(target)[0]
|
|
|
|
# Check for an absolute path (within the wiki):
|
|
if target.startswith('/'):
|
|
# Note this might behave weirdly with the leading // thing, but I don't
|
|
# _think_ Pandoc supports that anyway:
|
|
return target.replace('/', '', 1)
|
|
|
|
page_elements = page.split('/')
|
|
# Get rid of the page name:
|
|
page_elements.pop()
|
|
|
|
path_elements = page_elements + target.split('/')
|
|
|
|
resolved_path = []
|
|
while len(path_elements) > 0:
|
|
element = path_elements.pop()
|
|
if element == '..' and len(path_elements) > 0:
|
|
# Discard a directory:
|
|
path_elements.pop()
|
|
else:
|
|
resolved_path.append(element)
|
|
|
|
resolved_path.reverse()
|
|
return '/'.join(resolved_path)
def extract_values(elem, doc):
    """Record outbound links from the current page in sqlite.

    Panflute walker callback: for each Link element encountered,
    resolve its destination and insert a (page, target) row into the
    links table.  Relies on the module-level cursor `c`, connection
    `conn`, and the `pagename` global set by the main loop.
    """
    # Only Link elements carry a URL we care about:
    if not isinstance(elem, Link):
        return

    target = elem.url

    # Skip in-page anchors, for now:
    if target.startswith('#'):
        return

    # Insert a row of data (IGNORE dedupes repeated links):
    row = (pagename, resolve_target(target, pagename))
    c.execute("INSERT OR IGNORE INTO links VALUES (?, ?)", row)

    conn.commit()
# Ensure we're in the wiki directory:
notes_dir = os.path.join(os.getenv('HOME'), 'notes')
vimwiki_dir = os.path.join(notes_dir, 'vimwiki')
# chdir before connecting so the relative 'metadata.db' path below
# resolves inside the notes directory:
os.chdir(notes_dir)

# NOTE(review): assumes metadata.db and its pages/links tables already
# exist — no CREATE TABLE is issued here; confirm schema setup happens
# elsewhere.
conn = sqlite3.connect('metadata.db')
c = conn.cursor()

# Matches diary page paths, capturing the YYYY-MM-DD portion; used in
# the main loop to derive a date for diary pages lacking date metadata.
date_re = re.compile('diary/([0-9]{4}-[0-9]{2}-[0-9]{2})')
# Each command-line argument is a .wiki page file to index.  NOTE:
# pagename and doc are module-level names here; extract_values() reads
# pagename when doc.walk() invokes it below.
for input_file in sys.argv[1:]:

    # Trim leading directory and .wiki extension:
    # XXX: This fails badly if in a symlinked path
    input_file_abspath = os.path.abspath(input_file)
    pagename = input_file_abspath.replace(vimwiki_dir + os.sep, '', 1)
    pagename = os.path.splitext(pagename)[0]

    # XXX: do real logging
    # print(pagename)

    # _logscratch is for ephemeral renderings of logs for other pages - skip
    # collecting any metadata for these pages:
    if pagename.startswith('_logscratch'):
        continue

    # Parse the raw vimwiki markup into a panflute document tree:
    with open(input_file_abspath) as page:
        doc = convert_text(
            page.read(),
            input_format='vimwiki',
            standalone=True
        )

    # Title comes from page metadata, falling back to the page name.
    # XXX: This should fall back to headers
    title = doc.get_metadata('title')
    if not title:
        title = pagename

    # This falls back to diary paths for date metadata:
    date = doc.get_metadata('date')
    if not date:
        date_match_result = re.match(date_re, pagename)
        if date_match_result:
            # XXX: This needs to include timezone somehow or another
            # THIS IS A DIRTY HACK, MAKE IT USE THE ACTUAL CURRENT TZ
            date = date_match_result.group(1) + " 00:00:00.000000000-07:00"

    # Log the name and metadata of the page (delete-then-insert keeps
    # exactly one row per page; date may be None for undated pages):
    c.execute("DELETE FROM pages WHERE page = ?", (pagename,))
    c.execute("INSERT INTO pages VALUES (?, ?, ?)", (pagename, title, date))

    # Clear any links from this page in case something's been deleted:
    c.execute("DELETE FROM links WHERE page = ?", (pagename,))

    conn.commit()

    # Walk the document tree; extract_values inserts and commits a
    # links row for each outbound link it finds.
    doc.walk(extract_values)

conn.close()
|