Dotfiles, utilities, and other apparatus.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

137 lines
4.1 KiB

  1. #!/usr/bin/env python3
  2. """
  3. This script collects metadata from VimWiki pages in my notes.
  4. """
  5. import os
  6. import re
  7. import sys
  8. import sqlite3
  9. import re
  10. from pprint import pprint
  11. from panflute import *
  12. def resolve_target(target, page):
  13. """Resolve the target of a link to a canonical name."""
  14. if re.search('https?://|file:|tel:|mailto:', target):
  15. return target
  16. # Special case diary entries because Pandoc returns them as relative links.
  17. # This will break if you're trying to link to foo/diary/2023-01-01 instead
  18. # of /diary/2023-01-01, but at least within my notes that's not something
  19. # I do, and it's probably a rare case in general.
  20. if re.search('diary/\d{4}-\d{2}-\d{2}', target):
  21. return target
  22. # At this point, we should be fairly confident the link is a wiki page.
  23. # Normalize it by removing the .html extension that pandoc throws on there:
  24. if target.endswith('.html'):
  25. target = os.path.splitext(target)[0]
  26. # Check for an absolute path (within the wiki):
  27. if target.startswith('/'):
  28. # Note this might behave weirdly with the leading // thing, but I don't
  29. # _think_ Pandoc supports that anyway:
  30. return target.replace('/', '', 1)
  31. page_elements = page.split('/')
  32. # Get rid of the page name:
  33. page_elements.pop()
  34. path_elements = page_elements + target.split('/')
  35. resolved_path = []
  36. while len(path_elements) > 0:
  37. element = path_elements.pop()
  38. if element == '..' and len(path_elements) > 0:
  39. # Discard a directory:
  40. path_elements.pop()
  41. else:
  42. resolved_path.append(element)
  43. resolved_path.reverse()
  44. return '/'.join(resolved_path)
  45. def extract_values(elem, doc):
  46. """Extract links from given document, write to sqlite."""
  47. # pprint(elem)
  48. if isinstance(elem, Link):
  49. link_target = elem.url
  50. # Skip in-page anchors, for now:
  51. if link_target.startswith('#'):
  52. return
  53. # Insert a row of data
  54. c.execute(
  55. "INSERT OR IGNORE INTO links VALUES (?, ?)",
  56. (
  57. pagename,
  58. resolve_target(link_target, pagename)
  59. )
  60. )
  61. conn.commit()
  62. # Ensure we're in the wiki directory:
  63. notes_dir = os.path.join(os.getenv('HOME'), 'notes')
  64. vimwiki_dir = os.path.join(notes_dir, 'vimwiki')
  65. os.chdir(notes_dir)
  66. conn = sqlite3.connect('metadata.db')
  67. c = conn.cursor()
  68. date_re = re.compile('diary/([0-9]{4}-[0-9]{2}-[0-9]{2})')
  69. for input_file in sys.argv[1:]:
  70. # Trim leading directory and .wiki extension:
  71. # XXX: This fails badly if in a symlinked path
  72. input_file_abspath = os.path.abspath(input_file)
  73. pagename = input_file_abspath.replace(vimwiki_dir + os.sep, '', 1)
  74. pagename = os.path.splitext(pagename)[0]
  75. # XXX: do real logging
  76. # print(pagename)
  77. # _logscratch is for ephemeral renderings of logs for other pages - skip
  78. # collecting any metadata for these pages:
  79. if pagename.startswith('_logscratch'):
  80. continue
  81. with open(input_file_abspath) as page:
  82. doc = convert_text(
  83. page.read(),
  84. input_format='vimwiki',
  85. standalone=True
  86. )
  87. # XXX: This should fall back to headers
  88. title = doc.get_metadata('title')
  89. if not title:
  90. title = pagename
  91. # This falls back to diary paths for date metadata:
  92. date = doc.get_metadata('date')
  93. if not date:
  94. date_match_result = re.match(date_re, pagename)
  95. if date_match_result:
  96. # XXX: This needs to include timezone somehow or another
  97. # THIS IS A DIRTY HACK, MAKE IT USE THE ACTUAL CURRENT TZ
  98. date = date_match_result.group(1) + " 00:00:00.000000000-07:00"
  99. # Log the name and metadata of the page:
  100. c.execute("DELETE FROM pages WHERE page = ?", (pagename,))
  101. c.execute("INSERT INTO pages VALUES (?, ?, ?)", (pagename, title, date))
  102. # Clear any links from this page in case something's been deleted:
  103. c.execute("DELETE FROM links WHERE page = ?", (pagename,))
  104. conn.commit()
  105. doc.walk(extract_values)
  106. conn.close()