""" hacky script to convert saved wordpress sites to markdown for use in https://github.com/expectocode/pagong """ import bs4 import os import sys import re from pathlib import Path import urllib.parse import dateutil.parser import shutil def header(tag_name): if m := re.match(r'h([1-6])', tag_name): return int(m[1]) def rewrite_img_src(src): if '//' in src: return src else: return src.split('/')[-1] def handle(tag, pre=False, list_ty=None): if isinstance(tag, bs4.NavigableString): tag = str(tag) if pre: yield tag else: value = re.sub(r'\s+', ' ', tag) if not value.isspace(): yield value return if tag.name == 'div': pass elif level := header(tag.name): yield '\n\n' + '#' * level + ' ' elif tag.name == 'p': pass elif tag.name == 'em': yield '_' elif tag.name == 'strong': yield '**' elif tag.name == 'a': yield '[' elif tag.name == 'code': if not pre: yield '`' elif tag.name == 'ul': list_ty = list_ty or [] list_ty.append(None) elif tag.name == 'li': if not list_ty[-1]: yield '\n* ' else: yield f'\n{list_ty[-1]}. ' list_ty[-1] += 1 elif tag.name == 'pre': pre = True yield '\n```\n' elif tag.name == 'figure': yield '\n' elif tag.name == 'img': yield f'![{tag["alt"]}]({rewrite_img_src(tag["src"])})' elif tag.name == 'hr': yield '\n\n----------\n\n' elif tag.name == 'ol': list_ty = list_ty or [] list_ty.append(1) elif tag.name == 'br': yield '\n' elif tag.name == 'table': # bruh i ain't gonna parse tables yield tag.prettify() return elif tag.name == 'blockquote': yield '\n> ' elif tag.name == 's': yield '~~' elif tag.name == 'figcaption': yield '\n_' elif tag.name == 'video': yield f'' elif tag.name == 'cite': yield f'-- ' elif tag.name in ('sub', 'sup'): yield f'<{tag.name}>' else: print('wtf is', tag.name) quit() for child in tag.children: yield from handle(child, pre=pre, list_ty=list_ty) if tag.name == 'div': pass elif header(tag.name): yield '\n\n' elif tag.name == 'p': yield '\n\n' elif tag.name == 'em': yield '_' elif tag.name == 'strong': yield '**' elif tag.name == 'a': yield f']({tag["href"]})' elif tag.name == 'code': if not pre: yield '`' elif tag.name == 'ul': list_ty.pop() yield '\n' elif tag.name == 'li': pass elif tag.name == 'pre': yield '\n```\n\n' elif tag.name == 'figure': yield '\n\n' elif tag.name == 'img': pass elif tag.name == 'hr': pass elif tag.name == 'ol': list_ty.pop() yield '\n' elif tag.name == 'br': pass elif tag.name == 'table': pass elif tag.name == 'blockquote': yield '\n' elif tag.name == 's': yield '~~' elif tag.name == 'figcaption': yield '_\n' elif tag.name == 'video': pass elif tag.name == 'cite': pass elif tag.name in ('sub', 'sup'): yield f'' def iter_local_img(file: Path, tag): if isinstance(tag, bs4.NavigableString): return if tag.name == 'img': src = tag["src"] if '//' not in src: f = file.parent / urllib.parse.unquote(src) if f.is_file(): yield f, rewrite_img_src(src) for child in tag.children: yield from iter_local_img(file, child) def main(): try: indir = Path(sys.argv[1]) outroot = Path(sys.argv[2]) except IndexError: print('usage:', sys.argv[0], '', '') exit(1) outroot.mkdir(exist_ok=True) for file in indir.iterdir(): if not file.is_file() or not file.name.endswith('.html'): continue with file.open(encoding='utf-8') as fd: soup = bs4.BeautifulSoup(fd.read(), 'html.parser') name = soup.find('link', rel='canonical') if name: name = name['href'] else: name = soup.find(id='cancel-comment-reply-link')['href'].split('#')[0] name = name.rstrip('/').split('/')[-1] outdir = outroot / name title = soup.find(class_='entry-title').text _author = soup.find(class_='entry-author').text # i'd rather not write this published = dateutil.parser.isoparse(soup.find(class_='published')['datetime']).replace(' ', 'T') # ISO 8601 updated = dateutil.parser.isoparse(soup.find(class_='updated')['datetime']).replace(' ', 'T') content = soup.find(class_='entry-content') outdir.mkdir(exist_ok=True) with open(outdir / 'post.md', 'w', encoding='utf-8') as fd: fd.write(f'''```meta title: {title} published: {published} updated: {updated} ``` ''') # hacky way to avoid the excessive amount of newlines except in pre blocks lines = ''.join(handle(content)).split('\n') pre = False empty = False for line in lines: if line.startswith('```'): fd.write(line) fd.write('\n') pre = not pre continue if not line or line.isspace(): empty = True else: if empty: fd.write('\n') empty = False fd.write(line) fd.write('\n') for src, dst in iter_local_img(file, content): shutil.copy(src, outdir / dst) main()