Utilisateur:Jona/xml extract.py

#! /usr/bin/env python
# -*- coding: utf-8 -*-

#Take a dump from a Wikimedia project and remove all the pages that are outside the main namespace.
#Write two files:
#The first, 'out', contains all the articles of the main namespace (those whose title has no known namespace prefix followed by ":").
#The second, 'outTitle', lists the titles of those articles (one title per line).

from __future__ import unicode_literals
from xml.sax import make_parser
from xml.sax.handler import ContentHandler
import sys, os, getopt, shutil, bz2, mimetypes

#TODO: Avoid parsing into html comments <!-- -->

class PreProcHandler(ContentHandler):
    """SAX handler that copies the main-namespace pages of a MediaWiki dump.

    For every <page> element the handler buffers the <title> and the first
    <text> element found after that title.  When the page closes, it is
    written out unless its title starts with one of the known namespace
    prefixes followed by ":".

    Two files are produced in the output directory:
      * 'out'      : "<title>TITLE</title>", a newline, the page text and a
                     newline, for every page that is kept;
      * 'outTitle' : the title of every kept page, one per line.

    All constructor arguments are optional.  An argument left to None falls
    back, at the moment it is needed, to the module-level global playing the
    same role (outputdir, prefixList, _debug), so PreProcHandler() keeps
    working for callers that configure the handler through those globals.
    """

    def __init__(self, output_dir=None, prefixes=None, debug=None):
        # ContentHandler is an old-style class on Python 2, so call the base
        # initializer explicitly instead of through super().
        ContentHandler.__init__(self)
        self.isTitle, self.isText, self.isFirstText = 0, 0, 1
        # Buffers exist from the start so a page lacking <title> or <text>
        # can never hit an AttributeError.
        self.title = u""
        self.text = u""
        # Text chunks are collected in a list and joined once: repeated "+="
        # on a long unicode string is quadratic.
        self._textParts = []
        self.f = None
        self.fTitle = None
        self._outputDir = output_dir
        self._prefixes = prefixes
        self._debugFlag = debug

    # Configuration lookups: explicit constructor value first, global second
    def _getOutputDir(self):
        return outputdir if self._outputDir is None else self._outputDir

    def _getPrefixes(self):
        return prefixList if self._prefixes is None else self._prefixes

    def _isDebug(self):
        return _debug if self._debugFlag is None else self._debugFlag

    def startDocument(self):
        # Local import: the file header does not import io.
        import io
        base = self._getOutputDir()
        # Text-mode files with an explicit encoding behave the same on
        # Python 2 and Python 3; the handler only ever writes unicode to them.
        self.f = io.open(base + '/out', 'w', encoding='utf-8')
        try:
            self.fTitle = io.open(base + '/outTitle', 'w', encoding='utf-8')
        except (IOError, OSError):
            # Do not leak the first file when the second cannot be opened
            self.f.close()
            raise

    def endDocument(self):
        self.f.close()
        self.fTitle.close()

    # Opening tag found, change flags
    def startElement(self, name, attrs):
        if name == 'page':
            # Start every page with empty buffers so that a page without
            # <text> does not inherit the text of the previous page.
            self.title = u""
            self.text = u""
            self._textParts = []
            self.isFirstText = 1
        elif name == 'title':
            self.isTitle = 1
            self.isFirstText = 1
            self.title = u""
        elif name == 'text' and self.isFirstText == 1:
            self.isText = 1
            self.text = u""
            self._textParts = []

    # Fill the buffers
    def characters(self, ch):
        if self.isTitle == 1:
            self.title += ch
        if self.isText == 1 and self.isFirstText == 1:
            self._textParts.append(ch)

    # Closing tag found, change flags and write buffers
    def endElement(self, name):
        if name == 'title':
            self.isTitle = 0
        elif name == 'text':
            if self.isText == 1:
                # Only the captured (first) <text> may replace the page text
                self.text = u"".join(self._textParts)
                self._textParts = []
            self.isText = 0
            self.isFirstText = 0
        elif name == 'page':
            prefixes = self._getPrefixes()
            # A namespace separator is a ":" that is neither the first nor
            # the last character of the title.
            indexBeforeColon = self.title.find(u":", 1, -1)
            prefix = None
            if indexBeforeColon != -1:
                prefix = self.title[:indexBeforeColon]
            hasUnknownPrefix = prefix is not None and prefix not in prefixes
            if prefix is None or hasUnknownPrefix:
                self.f.write(u"<title>" + self.title + u"</title>\n" + self.text + u"\n")
                self.fTitle.write(self.title + u"\n")
            if hasUnknownPrefix and self._isDebug():
                # Show prefix in title (like "talk" in "talk:dog") but not in the current list
                sys.stdout.write(u"[DEBUG] %s  , like in  %s\n" % (prefix, self.title))

def usage():
    """Write the list of command-line options to standard error."""
    # The long form of -o is spelled "--output": that is the name registered
    # with getopt, so the help text must advertise exactly that spelling.
    help_lines = (
        "Options available are\n",
        "-h --help\tShow this help",
        "-i --input\tGive the input file (it's \"xml_articles.xml\" by default)",
        "-o --output\tThe output directory, where \"out\" and \"outTitle\" will be written",
        "-v --verbose\t(nothing changing)",
        "-d\tshow debug info",
    )
    sys.stderr.write("\n".join(help_lines) + "\n")

def main(argv):
    """Parse the command line, protect existing output files and run the extraction.

    argv is the list of command-line arguments without the program name.

    The function sets the module-level globals read by PreProcHandler
    (outputdir, prefixList, _debug) as well as _verbose, then parses the
    input file, which may be plain XML or bzip2-compressed XML.

    It leaves through sys.exit(0) after showing the help, and through
    sys.exit(2) on an illegal option, a missing input file, an output path
    that is not a directory, an unsupported input encoding, or when the user
    refuses to overwrite existing output files.
    """
    global outputdir
    global _verbose
    global _debug
    global prefixList

    _verbose = 0
    _debug = 0
    # Namespace prefixes: a page whose title starts with one of them followed
    # by ":" is not part of the main namespace.
    prefixList = ["Wiktionnaire", "MediaWiki", "Annexe", "Modèle", "Fichier", "Aide", "Thésaurus", "Catégorie", "Projet", "Portail", "Transwiki", "WT"]

    source = "xml_articles.xml"
    outputdir = "."
    try:
        opts, _positional = getopt.getopt(argv, "hi:o:vd", ["help", "input=", "output=", "verbose"])
    except getopt.GetoptError:
        sys.stderr.write("Illegal argument\n")
        usage()
        sys.exit(2)
    #TODO: add a condition like "-c" (copy) to have an inputless script
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit(0)
        elif opt == '-d':
            _debug = 1
        elif opt in ("-v", "--verbose"):
            _verbose = 1
            #TODO: put verbose condition
        elif opt in ("-i", "--input"):
            source = arg
        elif opt in ("-o", "--output"):
            if not os.path.isdir(arg):
                sys.stderr.write(" '%s' is not a directory\n" % arg)
                usage()
                sys.exit(2)
            outputdir = arg

    if not os.path.isfile(source):
        sys.stderr.write("The input file '%s' was not found\n" % source)
        usage()
        sys.exit(2)

    # guess_type returns (mime type, encoding); the encoding is "bzip2" for
    # a *.bz2 file and None for an uncompressed one.
    inputType = mimetypes.guess_type(source)
    # Depending on the platform's MIME tables a ".xml" file is reported as
    # either "application/xml" or "text/xml"; both are expected here.
    if inputType[0] not in ("application/xml", "text/xml"):
        sys.stdout.write("Input file is of type : %s; This is strange\n" % str(inputType))

    # Look for output files left by a previous run.  If there are any, ask
    # whether they must be copied to *.old (default), overwritten (y) or
    # whether the program must stop (n).
    outPath = outputdir + "/out"
    titlePath = outputdir + "/outTitle"
    if os.path.isfile(outPath) or os.path.isfile(titlePath):
        try:
            ask = raw_input  # Python 2: read a line without evaluating it
        except NameError:
            ask = input  # Python 3: raw_input was renamed to input
        doicontinue = ask("Ouput file(s) already exist, overwrite ? [C/y/n] (by default copy the old file to *.old; y : to overwrite; n : to abort): ").lower()
        if doicontinue == "y":
            pass
        elif doicontinue == "n":
            sys.stdout.write("operation aborted by user\n")
            sys.exit(2)
        else:
            if os.path.isfile(outPath):
                shutil.copyfile(outPath, outPath + ".old")
            if os.path.isfile(titlePath):
                shutil.copyfile(titlePath, titlePath + ".old")
            sys.stdout.write("Old files copied to *.old\n")

    sys.stdout.write("It can now take several minutes...\n")
    parser = make_parser()
    curHandler = PreProcHandler()
    parser.setContentHandler(curHandler)
    if inputType[1] == "bzip2":
        compressed = bz2.BZ2File(source, 'r')
        try:
            parser.parse(compressed)
        finally:
            # Release the file handle even when the parser raises
            compressed.close()
    elif inputType[1] is None:
        parser.parse(source)
    else:
        sys.stderr.write("The input file type '%s' was not recognized\n" % inputType[1])
        usage()
        sys.exit(2)


# Script entry point: hand the command-line arguments (program name excluded)
# over to main() when the file is executed directly.
if __name__ == '__main__':
    cli_arguments = sys.argv[1:]
    main(cli_arguments)