content/blog/wptomd.py
1"""
2hacky script to convert saved wordpress sites to markdown for use in https://github.com/expectocode/pagong
3"""
import re
import shutil
import sys
import urllib.parse
from pathlib import Path

import bs4
import dateutil.parser

def header(tag_name):
    # return the heading level (1-6) for h1..h6 tag names, or None for anything else
    if m := re.match(r'h([1-6])', tag_name):
        return int(m[1])

def rewrite_img_src(src):
    # absolute URLs (anything containing '//') are left untouched;
    # local paths are collapsed to the bare file name, e.g. 'images/cat.png' -> 'cat.png'
    if '//' in src:
        return src
    else:
        return src.split('/')[-1]

def handle(tag, pre=False, list_ty=None):
    # recursively walk the HTML tree, yielding Markdown fragments
    if isinstance(tag, bs4.NavigableString):
        tag = str(tag)
        if pre:
            yield tag
        else:
            value = re.sub(r'\s+', ' ', tag)
            if not value.isspace():
                yield value
        return

    # opening markers
    if tag.name == 'div':
        pass
    elif level := header(tag.name):
        yield '\n\n' + '#' * level + ' '
    elif tag.name == 'p':
        pass
    elif tag.name == 'em':
        yield '_'
    elif tag.name == 'strong':
        yield '**'
    elif tag.name == 'a':
        yield '['
    elif tag.name == 'code':
        if not pre:
            yield '`'
    elif tag.name == 'ul':
        list_ty = list_ty or []
        list_ty.append(None)
    elif tag.name == 'li':
        if not list_ty[-1]:
            yield '\n* '
        else:
            yield f'\n{list_ty[-1]}. '
            list_ty[-1] += 1
    elif tag.name == 'pre':
        pre = True
        yield '\n```\n'
    elif tag.name == 'figure':
        yield '\n'
    elif tag.name == 'img':
        yield f'![{tag.get("alt", "")}]({rewrite_img_src(tag["src"])})'
    elif tag.name == 'hr':
        yield '\n\n----------\n\n'
    elif tag.name == 'ol':
        list_ty = list_ty or []
        list_ty.append(1)
    elif tag.name == 'br':
        yield '\n'
    elif tag.name == 'table':
        # bruh i ain't gonna parse tables
        yield tag.prettify()
        return
    elif tag.name == 'blockquote':
        yield '\n> '
    elif tag.name == 's':
        yield '~~'
    elif tag.name == 'figcaption':
        yield '\n_'
    elif tag.name == 'video':
        yield f'<video controls="controls" src="{rewrite_img_src(tag["src"])}"></video>'
    elif tag.name == 'cite':
        yield '-- '
    elif tag.name in ('sub', 'sup'):
        yield f'<{tag.name}>'
    else:
        print('wtf is', tag.name)
        sys.exit(1)

    for child in tag.children:
        yield from handle(child, pre=pre, list_ty=list_ty)

    # closing markers
    if tag.name == 'div':
        pass
    elif header(tag.name):
        yield '\n\n'
    elif tag.name == 'p':
        yield '\n\n'
    elif tag.name == 'em':
        yield '_'
    elif tag.name == 'strong':
        yield '**'
    elif tag.name == 'a':
        yield f']({tag["href"]})'
    elif tag.name == 'code':
        if not pre:
            yield '`'
    elif tag.name == 'ul':
        list_ty.pop()
        yield '\n'
    elif tag.name == 'li':
        pass
    elif tag.name == 'pre':
        yield '\n```\n\n'
    elif tag.name == 'figure':
        yield '\n\n'
    elif tag.name == 'img':
        pass
    elif tag.name == 'hr':
        pass
    elif tag.name == 'ol':
        list_ty.pop()
        yield '\n'
    elif tag.name == 'br':
        pass
    elif tag.name == 'table':
        pass
    elif tag.name == 'blockquote':
        yield '\n'
    elif tag.name == 's':
        yield '~~'
    elif tag.name == 'figcaption':
        yield '_\n'
    elif tag.name == 'video':
        pass
    elif tag.name == 'cite':
        pass
    elif tag.name in ('sub', 'sup'):
        yield f'</{tag.name}>'
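
# A rough sketch of what handle() yields for a tiny, made-up fragment
# (parse the snippet with bs4 first and pass in the <p> tag):
#   <p>hi <strong>there</strong></p>  ->  'hi **there**\n\n'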


def iter_local_img(file: Path, tag):
    # yield (source path, destination name) for every <img> whose src points at
    # a file saved next to the page, so it can be copied into the output dir
    if isinstance(tag, bs4.NavigableString):
        return

    if tag.name == 'img':
        src = tag["src"]
        if '//' not in src:
            f = file.parent / urllib.parse.unquote(src)
            if f.is_file():
                yield f, rewrite_img_src(src)

    for child in tag.children:
        yield from iter_local_img(file, child)


def main():
    try:
        indir = Path(sys.argv[1])
        outroot = Path(sys.argv[2])
    except IndexError:
        print('usage:', sys.argv[0], '<IN DIR>', '<OUT DIR>')
        sys.exit(1)

    outroot.mkdir(exist_ok=True)

    for file in indir.iterdir():
        if not file.is_file() or not file.name.endswith('.html'):
            continue

        with file.open(encoding='utf-8') as fd:
            soup = bs4.BeautifulSoup(fd.read(), 'html.parser')

        # derive the post slug from the canonical link, falling back to the
        # comment-reply link when the page has no canonical URL
        name = soup.find('link', rel='canonical')
        if name:
            name = name['href']
        else:
            name = soup.find(id='cancel-comment-reply-link')['href'].split('#')[0]
        name = name.rstrip('/').split('/')[-1]

        outdir = outroot / name
        title = soup.find(class_='entry-title').text
        _author = soup.find(class_='entry-author').text  # i'd rather not write this
        published = dateutil.parser.isoparse(soup.find(class_='published')['datetime']).isoformat()  # ISO 8601
        updated = dateutil.parser.isoparse(soup.find(class_='updated')['datetime']).isoformat()
        content = soup.find(class_='entry-content')

        outdir.mkdir(exist_ok=True)
        with open(outdir / 'post.md', 'w', encoding='utf-8') as fd:
            fd.write(f'''```meta
title: {title}
published: {published}
updated: {updated}
```
''')

            # hacky way to avoid the excessive amount of newlines, except in pre blocks
            lines = ''.join(handle(content)).split('\n')
            pre = False
            empty = False
            for line in lines:
                if line.startswith('```'):
                    fd.write(line)
                    fd.write('\n')
                    pre = not pre
                    continue

                if not pre and (not line or line.isspace()):
                    empty = True
                else:
                    if empty:
                        fd.write('\n')
                        empty = False
                    fd.write(line)
                    fd.write('\n')

        # copy any locally saved images next to the generated post.md
        for src, dst in iter_local_img(file, content):
            shutil.copy(src, outdir / dst)


if __name__ == '__main__':
    main()
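
# Resulting layout, roughly (slug and file names are hypothetical):
#   <OUT DIR>/
#       my-first-post/
#           post.md         # begins with the ```meta block written above
#           some-image.png  # local images referenced by the post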