git rekt — gemini-redirect.git (a59ca1351120174591b7ba9c8808cb11a27d2dfb): content/blog/wptomd.py

content/blog/wptomd.py (view raw)
  1"""
  2hacky script to convert saved wordpress sites to markdown for use in https://github.com/expectocode/pagong
  3"""
  4import bs4
  5import os
  6import sys
  7import re
  8from pathlib import Path
  9import urllib.parse
 10import dateutil.parser
 11import shutil
 12
 13def header(tag_name):
 14    if m := re.match(r'h([1-6])', tag_name):
 15        return int(m[1])
 16
 17def rewrite_img_src(src):
 18    if '//' in src:
 19        return src
 20    else:
 21        return src.split('/')[-1]
 22
 23def handle(tag, pre=False, list_ty=None):
 24    if isinstance(tag, bs4.NavigableString):
 25        tag = str(tag)
 26        if pre:
 27            yield tag
 28        else:
 29            value = re.sub(r'\s+', ' ', tag)
 30            if not value.isspace():
 31                yield value
 32        return
 33
 34    if tag.name == 'div':
 35        pass
 36    elif level := header(tag.name):
 37        yield '\n\n' + '#' * level + ' '
 38    elif tag.name == 'p':
 39        pass
 40    elif tag.name == 'em':
 41        yield '_'
 42    elif tag.name == 'strong':
 43        yield '**'
 44    elif tag.name == 'a':
 45        yield '['
 46    elif tag.name == 'code':
 47        if not pre:
 48            yield '`'
 49    elif tag.name == 'ul':
 50        list_ty = list_ty or []
 51        list_ty.append(None)
 52    elif tag.name == 'li':
 53        if not list_ty[-1]:
 54            yield '\n* '
 55        else:
 56            yield f'\n{list_ty[-1]}. '
 57            list_ty[-1] += 1
 58    elif tag.name == 'pre':
 59        pre = True
 60        yield '\n```\n'
 61    elif tag.name == 'figure':
 62        yield '\n'
 63    elif tag.name == 'img':
 64        yield f'![{tag["alt"]}]({rewrite_img_src(tag["src"])})'
 65    elif tag.name == 'hr':
 66        yield '\n\n----------\n\n'
 67    elif tag.name == 'ol':
 68        list_ty = list_ty or []
 69        list_ty.append(1)
 70    elif tag.name == 'br':
 71        yield '\n'
 72    elif tag.name == 'table':
 73        # bruh i ain't gonna parse tables
 74        yield tag.prettify()
 75        return
 76    elif tag.name == 'blockquote':
 77        yield '\n> '
 78    elif tag.name == 's':
 79        yield '~~'
 80    elif tag.name == 'figcaption':
 81        yield '\n_'
 82    elif tag.name == 'video':
 83        yield f'<video controls="controls" src="{rewrite_img_src(tag["src"])}"></video>'
 84    elif tag.name == 'cite':
 85        yield f'-- '
 86    elif tag.name in ('sub', 'sup'):
 87        yield f'<{tag.name}>'
 88    else:
 89        print('wtf is', tag.name)
 90        quit()
 91
 92    for child in tag.children:
 93        yield from handle(child, pre=pre, list_ty=list_ty)
 94
 95    if tag.name == 'div':
 96        pass
 97    elif header(tag.name):
 98        yield '\n\n'
 99    elif tag.name == 'p':
100        yield '\n\n'
101    elif tag.name == 'em':
102        yield '_'
103    elif tag.name == 'strong':
104        yield '**'
105    elif tag.name == 'a':
106        yield f']({tag["href"]})'
107    elif tag.name == 'code':
108        if not pre:
109            yield '`'
110    elif tag.name == 'ul':
111        list_ty.pop()
112        yield '\n'
113    elif tag.name == 'li':
114        pass
115    elif tag.name == 'pre':
116        yield '\n```\n\n'
117    elif tag.name == 'figure':
118        yield '\n\n'
119    elif tag.name == 'img':
120        pass
121    elif tag.name == 'hr':
122        pass
123    elif tag.name == 'ol':
124        list_ty.pop()
125        yield '\n'
126    elif tag.name == 'br':
127        pass
128    elif tag.name == 'table':
129        pass
130    elif tag.name == 'blockquote':
131        yield '\n'
132    elif tag.name == 's':
133        yield '~~'
134    elif tag.name == 'figcaption':
135        yield '_\n'
136    elif tag.name == 'video':
137        pass
138    elif tag.name == 'cite':
139        pass
140    elif tag.name in ('sub', 'sup'):
141        yield f'</{tag.name}>'
142
143
def iter_local_img(file: Path, tag):
    """
    Walk `tag` and its descendants in document order, yielding
    `(source_path, destination_name)` for every `<img>` whose `src` is local
    (contains no `//`) and exists as a file relative to `file`'s directory.
    """
    # Explicit stack instead of recursion; children are pushed reversed so
    # they are popped in their original order.
    pending = [tag]
    while pending:
        node = pending.pop()
        if isinstance(node, bs4.NavigableString):
            continue

        if node.name == 'img':
            src = node["src"]
            is_remote = '//' in src
            if not is_remote:
                candidate = file.parent / urllib.parse.unquote(src)
                if candidate.is_file():
                    yield candidate, rewrite_img_src(src)

        pending.extend(reversed(list(node.children)))
157
158
def _collapse_blank_lines(text):
    """
    Yield the lines of `text`, squeezing each run of blank lines into a
    single empty line, except between ``` fences where lines pass through
    untouched. Trailing blank lines are dropped.
    """
    in_pre = False
    blank_pending = False
    for line in text.split('\n'):
        if line.startswith('```'):
            # Flush a pending blank before an opening fence so it cannot leak
            # into (or past) the code block.
            if blank_pending and not in_pre:
                yield ''
                blank_pending = False
            yield line
            in_pre = not in_pre
        elif in_pre:
            yield line
        elif not line or line.isspace():
            blank_pending = True
        else:
            if blank_pending:
                yield ''
                blank_pending = False
            yield line


def main():
    """
    Convert saved WordPress pages to markdown posts.

    Usage: `wptomd.py <IN DIR> <OUT DIR>`. For every `.html` file directly
    inside <IN DIR>, writes `<OUT DIR>/<slug>/post.md` (a `meta` header with
    title and published/updated timestamps, followed by the converted entry
    content) and copies the locally saved images it references next to it.
    The slug is the last path component of the canonical URL, falling back to
    the comment-reply link. The post author is intentionally not written.
    """
    try:
        indir = Path(sys.argv[1])
        outroot = Path(sys.argv[2])
    except IndexError:
        print('usage:', sys.argv[0], '<IN DIR>', '<OUT DIR>')
        sys.exit(1)

    outroot.mkdir(parents=True, exist_ok=True)

    for file in indir.iterdir():
        if not file.is_file() or not file.name.endswith('.html'):
            continue

        with file.open(encoding='utf-8') as fd:
            soup = bs4.BeautifulSoup(fd.read(), 'html.parser')

        name = soup.find('link', rel='canonical')
        if name:
            name = name['href']
        else:
            name = soup.find(id='cancel-comment-reply-link')['href'].split('#')[0]
        name = name.rstrip('/').split('/')[-1]

        outdir = outroot / name
        title = soup.find(class_='entry-title').text
        # isoparse returns a datetime; isoformat() renders it as ISO 8601
        # with the 'T' separator.
        published = dateutil.parser.isoparse(soup.find(class_='published')['datetime']).isoformat()
        updated = dateutil.parser.isoparse(soup.find(class_='updated')['datetime']).isoformat()
        content = soup.find(class_='entry-content')

        outdir.mkdir(exist_ok=True)
        with open(outdir / 'post.md', 'w', encoding='utf-8') as fd:
            fd.write(f'''```meta
title: {title}
published: {published}
updated: {updated}
```
''')

            # hacky way to avoid the excessive amount of newlines except in pre blocks
            markdown = ''.join(handle(content))
            fd.writelines(f'{line}\n' for line in _collapse_blank_lines(markdown))

        for src, dst in iter_local_img(file, content):
            shutil.copy(src, outdir / dst)
221
# Only run the conversion when executed as a script, not when imported.
if __name__ == '__main__':
    main()