Perspective to DokuWiki conversion

I've migrated my company's Perspective wiki to DokuWiki. The Perspective wiki had dozens of collections, editors, and attachments, and hundreds of pages, and the import script worked reasonably well. Perspective is a Windows-hosted wiki, so this is a Windows Python script. (The path separators are hardcoded.)

ToDo: This script creates an “imported.log” file with the same format as “changes.log”, but it can't be merged into changes.log as is. Andreas points out that, “The ID and the lastmod timestamp need to match. So first create the file, then get its timestamp and save that one to the changelog.”

I found that the script was saving pages whose original names contained spaces with the spaces removed. DokuWiki's behavior is to save a page with an underscore in place of each space. To fix this, replace this code:

name = GetNode(fields, "name", "")
                doku_namespace = ""
                doku_name = ""
                if name.find(":") != -1:
                    doku_namespace, doku_name = name.split(":")
                    doku_namespace = "\\" + doku_namespace
                    doku_name = name

with this code:

doku_namespace = ""
doku_name = ""
doku_namespace = "\\" + GetNode(fields, "name", "page.collection")
doku_name = GetNode(fields, "name", "page.display-name")

This will also allow you to find a page by searching for its full name with spaces in the search term. Previously a page could only be found by searching for the full name without spaces, or for a single word from the page title.


Requirements:

  • Python v2
  • Windows Share access to the source and destination wikis.
# ConvertPerspectiveToDokuwiki
# By: David Blume
# This was a quick hack, but it works reasonably well.
import sys
import os
import time
from xml.dom import minidom
# Path of the DokuWiki "data" directory; empty until main() fills it in from
# the command-line arguments.
doku_dir = ""
def _ElementText(element):
    """Return the concatenated text held directly inside a DOM element."""
    return "".join([child.nodeValue for child in element.childNodes
                    if child.nodeType in (minidom.Node.TEXT_NODE,
                                          minidom.Node.CDATA_SECTION_NODE)])


def CopyAttachments(attachments, path, doku_page, doku_namespace):
    """Copy a page's attachments into the DokuWiki media tree.

    attachments    -- DOM node whose <attachment> children each contain a
                      <version> and a <name> element.
    path           -- the page's "versions" directory; the files live in
                      "<path>\\<version>-attachments\\<name>".
    doku_page      -- open, writable page file; a media reference is appended
                      for every attachment not already embedded in the page.
    doku_namespace -- "" or "\\<namespace>" (leading path separator included).

    Files are copied to "<doku_dir>\\media<doku_namespace>" under a
    lower-cased name with spaces replaced by underscores.  A failed copy is
    reported as a warning and does not stop the import.
    """
    # Function-scope import: shutil is not among the file-level imports.
    import shutil

    doku_media_path = doku_dir + "\\media" + doku_namespace
    for attachment in attachments.childNodes:
        if attachment.nodeName != u'attachment':
            continue
        version_nodes = attachment.getElementsByTagName("version")
        name_nodes = attachment.getElementsByTagName("name")
        if not version_nodes or not name_nodes:
            continue
        # NOTE(review): assumes <version> and <name> carry their value as
        # text content -- confirm against a real Perspective export.
        version = _ElementText(version_nodes[0]).strip()
        name = _ElementText(name_nodes[0]).strip()
        if not name:
            continue
        doku_name = name.lower().replace(" ", "_")
        if not os.path.exists(doku_media_path):
            os.makedirs(doku_media_path)
        source_file = path + "\\" + version + "-attachments\\" + name
        try:
            # Copy in-process rather than through a shell "copy" command, so
            # quotes or other special characters in a file name cannot break
            # the command line and failures are actually noticed.
            shutil.copyfile(source_file, doku_media_path + "\\" + doku_name)
        except (IOError, OSError):
            print("WARNING: could not copy attachment " + source_file)
        if len(doku_namespace):
            name = doku_namespace[1:] + ":" + name
        # Images already embedded inline by ParsePage are not listed twice.
        if name not in attached_images:
            doku_page.write("\nAutomatically Attached : {{" + name + "}}\n")
doku_list_types = []    # Stack of open list kinds: "o" for <ol>, "u" for <ul>.
attached_images = []    # Media ids embedded inline in the page being parsed.
changes_log = None      # Handle of imported.log; opened by main().

# Perspective heading tag -> DokuWiki heading marker.
_DOKU_HEADERS = {
    u'h1': "======",
    u'h2': "=====",
    u'h3': "====",
    u'h4': "===",
    u'h5': "==",
}

# (old, new) substitutions applied to page text, in this order.
_TEXT_REPLACEMENTS = (
    ("&amp;", "&"),
    ("&lt;", "<"),
    ("&gt;", ">"),
    (u'\u201c', "\""),
    (u'\u201d', "\""),
    (u'\xb4', "'"),
    (u'\u2019', "'"),
    (u'\u2013', "-"),
    (u'\u2022', "."),
    (u'\u2026', "..."),
    (u'\u2018', "'"),
    (u'\xb7', "*"),
    # A literal "**" in the text would otherwise be read as bold markup.
    ("**", "<nowiki>**</nowiki>"),
)


def _NodeText(element):
    """Return the concatenated text held directly inside a DOM element."""
    return "".join([child.nodeValue for child in element.childNodes
                    if child.nodeType in (minidom.Node.TEXT_NODE,
                                          minidom.Node.CDATA_SECTION_NODE)])


def _Emit(doku_page, text):
    """Write text to the page, falling back to UTF-8 bytes if needed.

    A plain Python 2 file object encodes unicode as ASCII and raises
    UnicodeEncodeError on anything else; in that case the UTF-8 encoding is
    written instead.
    """
    try:
        doku_page.write(text)
    except UnicodeEncodeError:
        doku_page.write(text.encode("utf-8"))


def ParsePage(node, doku_page, doku_namespace, doku_name):
    """Recursively convert a Perspective page DOM subtree to DokuWiki markup.

    node           -- DOM node to convert (text node or element).
    doku_page      -- writable file-like object receiving the markup.
    doku_namespace -- "" or "\\<namespace>" (leading path separator included).
    doku_name      -- page name; passed through unchanged to recursive calls.

    Side effects: writes to doku_page, records every inline image in the
    module-level attached_images list, and tracks list nesting in the
    module-level doku_list_types stack.

    NOTE(review): the markup emitted for bold/italic/underline/<pre>, for
    closing links and for ending list items and paragraphs follows the
    DokuWiki formatting syntax -- confirm it matches the output you expect.
    """
    global doku_list_types
    global attached_images

    if node.nodeType == minidom.Node.TEXT_NODE:
        s = node.nodeValue.lstrip()
        if s:
            for old, new in _TEXT_REPLACEMENTS:
                s = s.replace(old, new)
            _Emit(doku_page, s)
        return

    node_name = node.nodeName

    if node_name == u'img':
        src = node.getAttribute("src")
        marker = src.find("name=")
        if marker == -1:
            # No "name=" parameter in the source: use the source as it is.
            src_name = src
        else:
            src_name = src[marker + 5:]
            if len(doku_namespace):
                src_name = doku_namespace[1:] + ":" + src_name
        # Remembered so the image is not listed again as a plain attachment.
        attached_images.append(src_name)
        _Emit(doku_page, "{{" + src_name + "}}")

    if node_name == u'link':
        # NOTE(review): assumes <name> and <anchor> carry their value as
        # text content -- confirm against a real Perspective export.
        name_nodes = node.getElementsByTagName("name")
        anchor_nodes = node.getElementsByTagName("anchor")
        link_dest = ""
        if name_nodes:
            link_dest = _NodeText(name_nodes[0])
        link_text = link_dest
        if anchor_nodes:
            link_text = _NodeText(anchor_nodes[0])
        if link_dest.startswith(":") and len(doku_namespace):
            link_dest = link_dest[1:]
        _Emit(doku_page, "[[" + link_dest + "|" + link_text + "]]")
        # The <name>/<anchor> children are already consumed; recursing into
        # them would write their text a second time.
        return

    doku_list_pop = False
    bold = False
    italics = False
    underline = False
    anchor = False
    preformatted = False
    line_item = False
    header = ""

    if node_name == u'a':
        _Emit(doku_page, "[[" + node.getAttribute("href") + "|")
        anchor = True
    if node_name == u'ol':
        doku_list_types.append("o")
        doku_list_pop = True
    if node_name == u'ul':
        doku_list_types.append("u")
        doku_list_pop = True
    if node_name == u'li':
        # An <li> outside any list is treated as a first-level bullet.
        depth = len(doku_list_types)
        bullet = "* "
        if depth == 0:
            depth = 1
        elif doku_list_types[-1] != 'u':
            bullet = "- "
        _Emit(doku_page, "  " * depth)
        _Emit(doku_page, bullet)
        line_item = True
    # What about tables?  How are they done?
    if node_name in _DOKU_HEADERS:
        header = _DOKU_HEADERS[node_name]
        _Emit(doku_page, header + " ")
    if node_name == u'span':
        style = node.getAttribute("style")
        if style:
            if style.find("font-weight:bold;") != -1:
                bold = True
                _Emit(doku_page, "**")
            if style.find("text-decoration:underline;") != -1:
                underline = True
                _Emit(doku_page, "__")
            if style.find("font-style:italic;") != -1:
                italics = True
                _Emit(doku_page, "//")
    if node_name == u'div':
        style = node.getAttribute("style")
        if style:
            if style.find("margin-left:40px;") != -1:
                # Maybe one day we'll support indentation
                pass
    if node_name == u'pre':
        preformatted = True
        _Emit(doku_page, "<code>\n")

    for subnode in node.childNodes:
        ParsePage(subnode, doku_page, doku_namespace, doku_name)

    # Close inline markup in the reverse of the order it was opened.
    if preformatted:
        _Emit(doku_page, "\n</code>\n")
    if italics:
        _Emit(doku_page, "//")
    if underline:
        _Emit(doku_page, "__")
    if bold:
        _Emit(doku_page, "**")
    if doku_list_pop:
        doku_list_types.pop()
    if len(header):
        _Emit(doku_page, " " + header + "\n")
    if anchor:
        _Emit(doku_page, "]]")
    if line_item:
        _Emit(doku_page, "\n")
    if node_name == u'p' and len(doku_list_types) == 0:
        _Emit(doku_page, "\n\n")
    if (node_name == u'br' or node_name == u'div') and len(doku_list_types) == 0:
        _Emit(doku_page, "\\\\ \n")
def GetNode(nodes, attribute, name):
    """Return the first node in nodes whose `attribute` equals `name`.

    Nodes are examined in order; None is returned when none of them matches.
    """
    for candidate in nodes:
        if candidate.getAttribute(attribute) != name:
            continue
        return candidate
    return None
def _FieldText(fields, field_name):
    """Return the text content of the <field> named field_name, or ""."""
    field = GetNode(fields, "name", field_name)
    if field is None:
        return ""
    return "".join([child.nodeValue for child in field.childNodes
                    if child.nodeType in (minidom.Node.TEXT_NODE,
                                          minidom.Node.CDATA_SECTION_NODE)])


def Walk(path):
    """Import every Perspective page found under path into DokuWiki.

    Each directory whose name ends in ".page" is treated as one page: its
    latest revision number is read from "latest.txt", the matching
    "versions\\<revision>.xml" is parsed, the page text is written to
    "<doku_dir>\\pages\\<namespace>\\<name>.txt", attachments are copied and
    one line is appended to the import log.  Any other directory is searched
    recursively.

    Requires the module-level doku_dir and changes_log to be set (main()
    does this).  Pages without a readable "latest.txt" or without a display
    name are skipped with a warning.
    """
    global changes_log
    global attached_images
    for filename in os.listdir(path):
        fullpath = path + "\\" + filename
        if not os.path.isdir(fullpath):
            continue
        if not filename.endswith(".page"):
            # Pages live inside collection directories, so descend.
            Walk(fullpath)
            continue

        # Parse this page
        # Single-argument print(...) also works as a Python 2 print statement.
        print("Parsing " + fullpath + "...")

        # Get the revision from "latest.txt"
        try:
            latest = open(fullpath + "\\latest.txt", "r")
            try:
                version = latest.read().strip()
            finally:
                latest.close()
        except IOError:
            print("WARNING: " + filename + " does not have a latest version.")
            continue

        source = minidom.parse(fullpath + "\\versions\\" + version + ".xml")
        assert source.documentElement.tagName == "page-data"
        fields = source.documentElement.getElementsByTagName("field")

        # NOTE(review): the namespace comes from the "page.collection" field
        # and the page name from "page.display-name"; field values are
        # assumed to be stored as text content -- confirm against a real
        # Perspective export.
        collection = _FieldText(fields, "page.collection").strip()
        display_name = _FieldText(fields, "page.display-name").strip()
        if not display_name:
            print("WARNING: " + filename + " does not have a page name.")
            continue
        doku_namespace = ""
        if collection:
            doku_namespace = "\\" + collection
        # DokuWiki stores pages lower-cased with "_" in place of each space.
        doku_namespace = doku_namespace.lower().replace(" ", "_")
        doku_name = display_name.lower().replace(" ", "_")

        page_dir = doku_dir + "\\pages" + doku_namespace
        if not os.path.isdir(page_dir):
            os.mkdir(page_dir)

        user = _FieldText(fields, "page.last-edit-username")
        date = _FieldText(fields, "page.last-edit-server-time").strip()
        date = str(int(time.mktime(time.strptime(date, '%d/%b/%y %H:%M:%S'))))

        doku_page = open(page_dir + "\\" + doku_name + ".txt", 'w')
        try:
            # Start a fresh list of inline images for this page.
            attached_images = []
            page = GetNode(fields, "name", "page.contents")
            if page is not None:
                ParsePage(page, doku_page, doku_namespace, doku_name)
            attachments = GetNode(fields, "name", "page.attachments")
            if attachments is not None:
                CopyAttachments(attachments, fullpath + "\\versions", doku_page, doku_namespace)
        finally:
            # Always release the page file, even if the conversion fails.
            doku_page.close()

        # A page outside any namespace must not get a leading ":" in its id.
        page_id = doku_name
        if doku_namespace:
            page_id = doku_namespace[1:] + ":" + doku_name
        # Keep only what follows the first ":" in the user name, if any.
        log_line = "\t".join([date, "", page_id, user[user.find(":") + 1:], "imported"]) + '\n'
        # The log is opened in binary mode, so write explicit UTF-8 bytes.
        changes_log.write(log_line.encode("utf-8"))
def main(args):
    """Run the import.

    args -- [perspective_dir, dokuwiki_dir].  perspective_dir is walked for
            pages; "<dokuwiki_dir>\\data" becomes the module-level doku_dir
            and "<dokuwiki_dir>\\data\\imported.log" receives one line per
            imported page.

    With any other number of arguments nothing is imported.
    """
    global doku_dir
    global changes_log
    source_dir = ""
    if len(args) == 2:
        source_dir = args[0]
        doku_dir = args[1] + "\\data"
        changes_log = open(doku_dir + "\\imported.log", 'wb')
        try:
            Walk(source_dir)
        finally:
            # Flush and release the log even if the import aborts part-way.
            changes_log.close()
    # Single-argument print(...) also works as a Python 2 print statement.
    print("Finished parsing " + source_dir)
# Script entry point: run the import when given exactly two arguments,
# otherwise explain how to call the script.
if __name__=='__main__':
    if len(sys.argv) == 3:
        main(sys.argv[1:])
    else:
        print("usage: " + sys.argv[0] + " perspective_dir dokuwiki_dir")
        print("Where the perspective_dir contains the .col (collection) directories, and")
        print("the dokuwiki_dir contains the dokuwiki's data directory as a child directory.")
