Zwiki2Dokuwiki is a Perl script that translates HTML pages exported from Zwiki into DokuWiki syntax. See the Zwiki2DokuWiki Home Page.

Extract HTML Page

The HTML pages were extracted with w3mir.

Launch Zwiki2DokuWiki

#  Copyright (C) 2006 foxmask (foxmask at gmail dot com)
#   version 0.2 
#  Zwiki2Dokuwiki is free software; you can redistribute it and/or modify it
#  under the terms of the GNU General Public License as published
#  by the Free Software Foundation; either version 2 of the License,
#  or (at your option) any later version.
#  Zwiki2DokuWiki is distributed in the hope that it will be useful, but
#  WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place, Suite 330, Boston,
#  MA  02111-1307  USA
# This script will 
# 1 - read a given directory
# 2 - copy each file foobar to foobar.tmp 
# 3 - open each foobar.tmp file and extract the "wiki content"
# 4 - rebuild a clean html page
# 5 - "translate" the html page to dokuwiki code
use strict;
# Package to use File
use File::Find;
use File::Basename;
use File::Copy;
use File::Path; 
# Package to use Wiki
use HTML::WikiConverter;
use HTML::WikiConverter::DokuWiki;
# Package to read HTML file as a Tree
use HTML::TreeBuilder;
# Package to estimate the elapsed time of the execution of current script
use Date::Calc qw(Today_and_Now Delta_YMDHMS Add_Delta_YMDHMS Delta_DHMS Date_to_Text);
# Converter shared by every page: it turns the cleaned-up HTML into DokuWiki
# markup. Links below base_uri / wiki_uri are treated as wiki links.
my $wc = HTML::WikiConverter->new(
    encoding => 'utf8',
    dialect  => 'DokuWiki',
    base_uri => 'http://wiki.intranet/',
    wiki_uri => [ 'http://wiki.intranet/' ],
);

# Starting date, used to report the elapsed time at the end of the run.
my ($y1, $m1, $d1, $h1, $mi1, $s1) = Today_and_Now();

# Exactly two arguments are expected: the source and the destination directory.
if (@ARGV != 2) {
    die "Usage: $0 <source directory> <destination directory>\n";
}

# Globals for the File::Find callback.
our ($src_dir, $dst_dir) = @ARGV;

# For test purposes:
#$src_dir = $ENV{HOME}.'/mirror';
#$dst_dir = $ENV{HOME}.'/dokuwiki/data/pages';
# print "src_dir : $src_dir \ndst dir : $dst_dir\n";

# If the source directory does not exist we exit.
if (!-d $src_dir) {
    die "Source directory $src_dir does not exist\n";
}

# Build the final directory.
mkpath([ $dst_dir ]);

# Startup on Zwiki2Dokuwiki.
# no_chdir keeps the current directory unchanged during the traversal, so the
# paths built from $File::Find::name stay valid even when the directories
# were given as relative paths on the command line.
find({ wanted => \&zwiki2dokuwiki, no_chdir => 1 }, $src_dir);

# Elapsed time
my ($y2, $m2, $d2, $h2, $mi2, $s2) = Today_and_Now();
my ($Dd, $Dh, $Dm, $Ds) = Delta_DHMS(
    $y1, $m1, $d1, $h1, $mi1, $s1,
    $y2, $m2, $d2, $h2, $mi2, $s2,
);
print "Elapse time: $Dd day(s) $Dh hre(s) $Dm mn $Ds s\n";
# File::Find callback, called once for every file and directory found under
# $src_dir:
#   - a directory is re-created under $dst_dir;
#   - a file whose name contains a dot is copied as it is;
#   - any other file is handled as a Zwiki page: cleanup() extracts its
#     content into "<source>.tmp", which is then converted to DokuWiki markup
#     and written to "<destination>.txt".
# Takes no argument. Reads the globals $src_dir and $dst_dir and the converter
# $wc. Problems are reported with warn() so that one bad file does not stop
# the whole run.
sub zwiki2dokuwiki {
    my $src = $File::Find::name;

    # Relative path from $src_dir. The prefix is anchored at the start of the
    # path and any trailing slash given on the command line is ignored.
    (my $top = $src_dir) =~ s{(?<=.)/+\z}{};
    (my $relative_pathname = $src) =~ s{\A\Q$top\E/*}{};

    # Destination: only the part below $dst_dir is lower-cased, so that the
    # destination directory itself keeps the name it was created with.
    my $dst = "$dst_dir/" . lc $relative_pathname;
    $dst =~ s/\.html$//;

    # If the current "file" ( $_ ) is a directory, we make it.
    if (-d $_) {
        mkpath([ $dst ]);
        return;
    }

    # We do not read files with a . in the name; just copy them. Only the
    # file name is tested: a dot in a parent directory does not count.
    if (basename($src) =~ /\./) {
        copy($src, $dst)
            or warn("Can't copy file $src to $dst: $!\n");
        return;
    }

    # Nothing to do when the page has already been converted.
    return if -e "$dst.txt";

    # We extract only the "body" identified by <div class="content">;
    # the rest (header and footer) is Zwiki specific.
    my $tmp = $src . ".tmp";
    cleanup($tmp, $src);

    if (!copy($tmp, $dst)) {
        warn("Can't copy file $tmp to $dst: $!\n");
        unlink $tmp;
        return;
    }

    print "Processing ", $dst, ".txt\n";
    if (open my $out, '>', "$dst.txt") {
        print {$out} $wc->html2wiki(file => $dst);
        close $out
            or warn("Can't write file $dst.txt: $!\n");
    }
    else {
        warn("Can't create file $dst.txt: $!\n");
    }

    # The intermediate HTML files are not needed any more.
    unlink $dst;
    unlink $tmp;
    return;
}
# cleanup( $filetmp, $filesrc )
# Reads the HTML file $filesrc, extracts every <div class="content"> from its
# body and writes them to $filetmp as a clean HTML page with a UTF-8 header.
# Inside each content div:
#   - forms are removed;
#   - links found in list items (ul > li > a) that are not http/ftp/file URLs
#     get their / replaced by : so that DokuWiki understands them.
# Returns nothing. A file that cannot be read or written is reported with
# warn() and skipped.
sub cleanup {
    my ($filetmp, $filesrc) = @_;

    # Read the source file.
    my $tree = HTML::TreeBuilder->new();
    if (!$tree->parse_file($filesrc)) {
        warn("Can't read file $filesrc: $!\n");
        $tree->delete;
        return;
    }
    my $root = $tree->find_by_tag_name('html');

    # Make the temp file.
    my $out;
    if (!open $out, '>', $filetmp) {
        warn("Can't create file $filetmp: $!\n");
        $tree->delete;
        return;
    }

    print {$out} "<html>\n";
    print {$out} "\t<head>\n";
    print {$out} "\t\t<meta http-equiv=\"Content-Type\"  content=\"text/html; charset=UTF-8\" />\n";
    print {$out} "\t</head>\n";

    # Read the body content.
    foreach my $body ($root->find_by_tag_name('body')) {
        print {$out} "\t<body>\n";

        # Isolate the <div class="content">. They are looked up directly from
        # the body, so a content div nested in other divs is written once.
        foreach my $div ($body->look_down(_tag => 'div', class => 'content')) {

            # If we find a form in the div, we remove it.
            foreach my $form ($div->find_by_tag_name('form')) {
                $form->delete();
            }

            foreach my $ul ($div->find_by_tag_name('ul')) {
                foreach my $li ($ul->find_by_tag_name('li')) {
                    foreach my $anchor ($li->find_by_tag_name('a')) {
                        # Only the anchor's own href counts; anchors without
                        # one are left alone.
                        my $link = $anchor->attr('href');
                        next if !defined $link;
                        next if $link =~ /^(http|ftp|file)/;

                        # If the link contains things like
                        # <a href="Admin/FrontPage">AdminWiki:FrontPage</a>
                        # we replace the / (slash) by : (colon)
                        # thus DokuWiki will "understand" the content of the href.
                        $link =~ s/\//:/g;

                        # The label is the text of the anchor, even when that
                        # text is wrapped in other tags.
                        my $label = $anchor->as_text;

                        # ~literal is the only way to rebuild a clean link from
                        # scratch, because as_HTML encodes < and > to &lt; &gt;
                        my $new_content = HTML::Element->new(
                            '~literal',
                            'text' => '<a href="' . $link . '">' . $label . '</a>',
                        );

                        # We replace the content not well formed for DokuWiki
                        # by this new one.
                        $anchor->replace_with($new_content);
                        $anchor->delete();
                    }
                }
            }

            # We add the rest of the file to the temp file.
            print {$out} $div->as_HTML();
        }

        # End of body.
        print {$out} "\t</body>\n";
    }

    # End of the html page; close the tmp file.
    print {$out} "</html>";
    close $out
        or warn("Can't write file $filetmp: $!\n");

    # Release the parsed tree.
    $tree->delete;
    return;
}

Installation in Dokuwiki

Copy the working directory into a DokuWiki installation (this can be done directly in step 2 if you give the final directory on the command line).

