- #!/home/brennen/python_bullshit/bin/python3
-
- """
- This script collects metadata from VimWiki pages in my notes.
- """
-
- import os
- import re
- import sys
- import sqlite3
- import re
- from pprint import pprint
- from panflute import *
-
- def resolve_target(target, page):
- """Resolve the target of a link to a canonical name."""
-
- if re.search('https?://|file:|tel:|mailto:', target):
- return target
-
- # Special case diary entries because Pandoc returns them as relative links.
- # This will break if you're trying to link to foo/diary/2023-01-01 instead
- # of /diary/2023-01-01, but at least within my notes that's not something
- # I do, and it's probably a rare case in general.
- if re.search('diary/\d{4}-\d{2}-\d{2}', target):
- return target
-
- # At this point, we should be fairly confident the link is a wiki page.
- # Normalize it by removing the .html extension that pandoc throws on there:
- if target.endswith('.html'):
- target = os.path.splitext(target)[0]
-
- # Check for an absolute path (within the wiki):
- if target.startswith('/'):
- # Note this might behave weirdly with the leading // thing, but I don't
- # _think_ Pandoc supports that anyway:
- return target.replace('/', '', 1)
-
- page_elements = page.split('/')
- # Get rid of the page name:
- page_elements.pop()
-
- path_elements = page_elements + target.split('/')
-
- resolved_path = []
- while len(path_elements) > 0:
- element = path_elements.pop()
- if element == '..' and len(path_elements) > 0:
- # Discard a directory:
- path_elements.pop()
- else:
- resolved_path.append(element)
-
- resolved_path.reverse()
- return '/'.join(resolved_path)
-
def extract_values(elem, doc):
    """Store the outgoing link carried by a document element, if any.

    Used as the callback passed to ``doc.walk``. Elements that are not
    ``Link`` instances are ignored, as are links whose URL starts with
    ``#``. For every other link, a row pairing the module-level
    ``pagename`` with the resolved target is inserted into the ``links``
    table through the module-level cursor ``c`` and committed on ``conn``.

    Args:
        elem: The element currently being visited.
        doc: The document being walked (not used here).
    """
    if not isinstance(elem, Link):
        return

    url = elem.url

    # In-page anchors are skipped, for now.
    if url.startswith('#'):
        return

    # One (source page, canonical target) row per link; duplicates are
    # ignored by the statement itself.
    row = (pagename, resolve_target(url, pagename))
    c.execute("INSERT OR IGNORE INTO links VALUES (?, ?)", row)
    conn.commit()
-
# Ensure we're in the wiki directory.
# Prefer $HOME as before, but fall back to expanduser() when it is unset so
# that os.path.join() is never handed None.
home_dir = os.getenv('HOME')
if home_dir is None:
    home_dir = os.path.expanduser('~')
notes_dir = os.path.join(home_dir, 'notes')
vimwiki_dir = os.path.join(notes_dir, 'vimwiki')
os.chdir(notes_dir)

# The database path is relative, so it lives in notes_dir (see chdir above).
# conn, c and pagename stay module-level because extract_values() reads them.
conn = sqlite3.connect('metadata.db')
c = conn.cursor()

date_re = re.compile('diary/([0-9]{4}-[0-9]{2}-[0-9]{2})')

# Make sure the connection is closed even if reading or converting one of the
# pages raises part-way through the run.
try:
    for input_file in sys.argv[1:]:

        # Trim leading directory and .wiki extension:
        # XXX: This fails badly if in a symlinked path
        # Note: because of the chdir above, relative arguments are resolved
        # against notes_dir, not the directory the script was started from.
        input_file_abspath = os.path.abspath(input_file)
        pagename = input_file_abspath.replace(vimwiki_dir + os.sep, '', 1)
        pagename = os.path.splitext(pagename)[0]

        # XXX: do real logging

        # _logscratch is for ephemeral renderings of logs for other pages -
        # skip collecting any metadata for these pages:
        if pagename.startswith('_logscratch'):
            continue

        with open(input_file_abspath) as page:
            doc = convert_text(
                page.read(),
                input_format='vimwiki',
                standalone=True
            )

        # XXX: This should fall back to headers
        title = doc.get_metadata('title')
        if not title:
            title = pagename

        # This falls back to diary paths for date metadata:
        date = doc.get_metadata('date')
        if not date:
            # match() anchors at the start, so only top-level diary/ pages
            # get a date derived from their name.
            date_match_result = date_re.match(pagename)
            if date_match_result:
                # XXX: This needs to include timezone somehow or another
                # THIS IS A DIRTY HACK, MAKE IT USE THE ACTUAL CURRENT TZ
                date = date_match_result.group(1) + " 00:00:00.000000000-07:00"

        # Log the name and metadata of the page (replace any previous row):
        c.execute("DELETE FROM pages WHERE page = ?", (pagename,))
        c.execute("INSERT INTO pages VALUES (?, ?, ?)", (pagename, title, date))

        # Clear any links from this page in case something's been deleted:
        c.execute("DELETE FROM links WHERE page = ?", (pagename,))

        conn.commit()

        # Re-populate the links table for this page; extract_values() commits
        # its own inserts.
        doc.walk(extract_values)
finally:
    conn.close()
|