-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathwikicontent.py
313 lines (271 loc) · 11.8 KB
/
wikicontent.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
"""
Methods for converting Mediawiki content to the Dokuwiki format.
Uses mwlib to parse the Mediawiki markup.
Copyright (C) 2014 Angus Gratton
Licensed under New BSD License as described in the file LICENSE.
"""
from __future__ import print_function, unicode_literals, absolute_import, division
import re, string, dokuwiki, visitor
from mwlib.parser import *
from mwlib import uparser
# Regex to match any known File: namespace (can be updated based on the mediawiki installation language)
mw_file_namespace_aliases = re.compile("^(Image|File):", re.IGNORECASE)
dw_file_namespace = "File:"
def set_file_namespaces(canonical_alias, aliases):
"""
Allow the mediawiki parser to match localised namespaces for files/images
Arguments:
canonical_alias is the single namespace that dokuwiki will use (default File:)
aliases is a list of alternative namespace names that will be converted to the canonical alias
"""
global mw_file_namespace_aliases
global dw_file_namespace
dw_file_namespace = canonical_alias + ":"
mw_file_namespace_aliases = re.compile("^(%s):" % "|".join(aliases), re.IGNORECASE)
def is_file_namespace(target):
"""
Is this target URL part of a known File or Image path?
"""
return re.match(mw_file_namespace_aliases, target)
def canonicalise_file_namespace(target):
"""
Convert any known File: or Image: (or alias) namespace link to be File:
... mediawiki stores all these under a common namespace, so dokuwiki has no choice but to import
them all under a single canonical
"""
return re.sub(mw_file_namespace_aliases, dw_file_namespace, target)
def convert_pagecontent(title, content):
"""
Convert a string in Mediawiki content format to a string in
Dokuwiki content format.
"""
# this is a hack for mwlib discarding the content of <nowiki> tags
# and replacing them with plaintext parsed HTML versions of the
# content (pragmatic, but not what we want)
nowiki_plaintext = []
# Instead we save the content here, replace it with the "magic" placeholder
# tag <__yamdwe_nowiki> and the index where the content was saved, then pass
# the list of nowiki content into the parser as context.
def add_nowiki_block(match):
nowiki_plaintext.append(match.group(0))
return "<__yamdwe_nowiki>%d</__yamdwe_nowiki>" % (len(nowiki_plaintext)-1,)
content = re.sub(r"<nowiki>.+?</nowiki>", add_nowiki_block, content)
root = uparser.parseString(title, content) # create parse tree
context = {}
context["list_stack"] = []
context["nowiki_plaintext"] = nowiki_plaintext # hacky way of attaching to child nodes
result = convert(root, context, False)
# mwlib doesn't parse NOTOC, so check for it manually
if re.match(r"^\s*__NOTOC__\s*$", content, re.MULTILINE):
result = "~~NOTOC~~"+("\n" if not result.startswith("\n") else "")+result
return result
def convert_children(node, context):
"""Walk the children of this parse node and call convert() on each.
"""
result = ""
for child in node.children:
res = convert(child, context, result.endswith("\n"))
if type(res) is str:
res = unicode(res)
if type(res) is not unicode:
print("Got invalid response '%s' when processing '%s'" % (res,child))
result += res
return result
@visitor.when(Article)
def convert(node, context, trailing_newline):
return convert_children(node, context)
@visitor.when(Paragraph)
def convert(node, context, trailing_newline):
return convert_children(node, context) + "\n"
@visitor.when(Text)
def convert(text, context, trailing_newline):
if text._text is None:
return ""
m = re.match(r"<__yamdwe_nowiki>([0-9]+)</__yamdwe_nowiki>", text._text)
if m is not None: # nowiki content!
index = int(m.group(1))
return context["nowiki_plaintext"][index] # nowiki_plaintext entry includes <nowiki> tags
else:
return text.caption
@visitor.when(Section)
def convert(section, context, trailing_newline):
result = ""
if section.tagname == "p":
pass
elif section.tagname == "@section":
level = section.level
heading = convert(section.children.pop(0), context, trailing_newline).strip()
heading_boundary = "="*(8-level)
result = "\n%s %s %s\n" % (heading_boundary, heading, heading_boundary)
else:
print("Unknown tagname %s" % section.tagname)
return result + convert_children(section, context)
@visitor.when(Style)
def convert(style, context, trailing_newline):
formatter = {
";" : ("**", r"**\\"), # definition (essentially boldface)
"''" : ("//", "//"), # italics
"'''" :("**", "**"), # boldface
":" : ("", ""), # other end of a definition???
"sub" : ("<sub>","</sub>"),
"sup" : ("<sup>","</sup>"),
"big" : ("**", "**"), # <big> not in dokuwiki so use bold
"-" : ("<blockquote>", "</blockquote>"), # use dokuwikis Blockquote Plugin for this
"u" : ("", ""), # <br> already handled in TagNode @visitor
"s" : ("<del>", "</del>") # According to the mediawiki docs <s>..</s> is synonymous with <del>...</del> (although one is treates as a tag and one a style in the parser??)
}.get(style.caption, None)
if formatter is None:
print("WARNING: Ignoring unknown formatter %s" % style.caption)
formatter = ("","")
return formatter[0] + convert_children(style, context) + formatter[1]
@visitor.when(NamedURL)
def convert(url, context, trailing_newline):
text = convert_children(url, context).strip(" ")
url = url.caption
if len(text):
return u"[[%s|%s]]" % (url, text)
else:
return u"%s" % (url)
@visitor.when(URL)
def convert(url, context, trailing_newline):
return url.caption
@visitor.when(ImageLink)
def convert(link, context, trailing_newline):
suffix = ""
if link.width is not None:
if link.height is None:
suffix = "?%s" % link.width
else:
suffix = "?%sx%s" % (link.width, link.height)
else:
try:
if link.in_gallery: # see below for Tag->gallery handling
suffix = "?160" # gallery images should be thumbnailed
except AttributeError:
pass # not in a gallery
prealign = " " if link.align in [ "center", "right" ] else ""
postalign = " " if link.align in [ "center", "left" ] else ""
target = canonicalise_file_namespace(link.target)
target = convert_internal_link(target)
return "{{%s%s%s%s}}" % (prealign, target, suffix, postalign)
@visitor.when(ArticleLink)
def convert(link, context, trailing_newline):
text = convert_children(link, context).strip(" ")
pagename = convert_internal_link(link.target)
if len(text):
return u"[[%s|%s]]" % (pagename, text)
else:
return u"[[%s]]" % pagename
@visitor.when(CategoryLink)
def convert(link, context, trailing_newline):
# Category functionality can be implemented with plugin:tag, but not used here
return ""
@visitor.when(NamespaceLink)
def convert(link, context, trailing_newline):
if is_file_namespace(link.target): # is a link to a file or image
filename = dokuwiki.make_dokuwiki_pagename(canonicalise_file_namespace(link.target))
caption = convert_children(link, context).strip()
if len(caption) > 0:
return u"{{%s%s}}" % (filename, caption)
else:
return u"{{%s}}" % filename
print("WARNING: Ignoring namespace link to " + link.target)
return convert_children(link, context)
@visitor.when(ItemList)
def convert(itemlist, context, trailing_newline):
context["list_stack"].append("* " if itemlist.tagname == "ul" else "- ")
converted_list = convert_children(itemlist, context)
context["list_stack"].pop()
return converted_list
@visitor.when(Item)
def convert(item, context, trailing_newline):
item_content = convert_children(item, context)
list_stack = context["list_stack"]
return " "*len(list_stack) + list_stack[-1] + item_content
@visitor.when(Table)
def convert(table, context, trailing_newline):
# we ignore the actual Table tags, instead convert each Row & Cell individually
return convert_children(table, context)
@visitor.when(Cell)
def convert(cell, context, trailing_newline):
marker = "^" if cell.tagname == "th" else "|"
result = u"%s %s" % (marker, convert_children(cell, context).replace('\n','').strip())
return result
@visitor.when(Row)
def convert(row, context, trailing_newline):
return convert_children(row, context) + " |\n"
@visitor.when(PreFormatted)
def convert(pre, context, trailing_newline):
in_list = len(context["list_stack"]) > 0
if trailing_newline and not in_list: # in its own paragraph, use a two space indent
return " " + convert_children(pre, context).replace("\n","\n ").strip(" ")
else: # inline in a list or a paragraph body, use <code> tags
return "<code>" + convert_children(pre, context) + "</code>"
@visitor.when(TagNode)
def convert(tag, context, trailing_newline):
# dict maps mediawiki tag name to tuple of starting, ending dokuwiki tag
simple_tagitems = {
"tt" : ("''", "''"),
"ref" : ("((","))"), # references converted to footnotes
"code" : ("<code>","</code>"),
"del": ("<del>", "</del>"),
}
if tag.tagname in simple_tagitems:
pre,post = simple_tagitems[tag.tagname]
return pre + convert_children(tag, context) + post
elif tag._text is not None:
if tag._text.replace(" ","").replace("/","") == "<br>":
return "\n" # this is a oneoff hack for one wiki page covered in <br/>
return tag._text # may not work for non-selfclosing tags
elif tag.tagname == "gallery":
# with a lot of cleverness we could use the gallery plugin for this,
# but this should do. We flag each child image as in the gallery, then
# deal with the ImageLinks above
for child in tag.children:
child.in_gallery = True
elif tag.tagname == "references":
print("WARNING: <references> tag has no equivalent in Dokuwiki, ignoring...")
return convert_children(tag, context)
@visitor.when(Math)
def convert(node, context, trailing_newline):
"""
Convert <math></math> tags for rendering of math terms
there are a couple of extension to support this in dokuwiki
tested successfully with MathJax plugin
"""
if "\n" in node.math:
# multiple lines are a formula block
return "$$" + node.math + "$$"
# anything else is inline term
return "$" + node.math + "$"
@visitor.when(Caption)
def convert(node, context, trailing_newline):
"""
Convert table captions to bold paragraph preceeding the table.
Because we ignore the <table> tags when converting to dokuwiki,
we can get away with simply converting to bold text without
worrying about it being inside <table> (which <caption> should be)
in the rendered HTML.
"""
return "** %s **\n" % convert_children(node, context)
# catchall for Node, which is the parent class of everything above
@visitor.when(Node)
def convert(node, context, trailing_newline):
if node.__class__ != Node:
print("WARNING: Unsupported node type: %s" % (node.__class__))
return convert_children(node, context)
def convert_internal_link(mw_target):
"""
Convert an internal Mediawiki link, with or without an anchor # in the middle.
Same as converting a plain pagename, only we want to preserve any #s in the target text.
"""
if "#" in mw_target:
page,anchor = mw_target.split("#",1)
else:
page = mw_target
anchor = None
if len(page):
page = dokuwiki.make_dokuwiki_pagename(page)
if anchor is not None:
page = page + "#" + dokuwiki.make_dokuwiki_heading_id(anchor)
return page