Utilisateur:FtiercelBot/fr-conj-cat.py

fr-conj-cat.py

#!/usr/bin/python
# -*- coding: utf-8  -*-

"""
This bot goes over pages of French conjugation annexes on the home wiki
and checks the sort-key ('cat') parameter of {{fr-conj-1}} templates,
logging the pages whose key needs updating to a timestamped text file.

Don't forget to set the ftout to your current list of words,
see below for a line that looks like:
ftout =open('/home/cmillet/wikitruks/wiktio/all/2005-12-14.txt', 'r')

This script understands various command-line arguments:

    -start:        used as -start:page_name, specifies that the robot should
                   go alphabetically through all pages on the home wiki,
                   starting at the named page.

    -file:         used as -file:file_name, read a list of pages to treat
                   from the named textfile. Page titles should be enclosed
                   in [[double-squared brackets]].

    -ref:          used as -ref:page_name, specifies that the robot should
                   touch all pages referring to the named page.

    -cat:          used as -cat:category_name, specifies that the robot should
                   touch all pages in the named category.

All other parameters will be regarded as a page title; in this case, the bot
will only touch a single page.
"""
import wikipedia, wiktionary, pagegenerators, catlib
import sys
import re
import time

# Page-name prefix of French conjugation annexes and the template whose
# sort key ('cat' parameter) this bot maintains.
heading = u'Annexe:Conjugaison française:'
templateName = u'fr-conj-1'

# Character classes used to build the sort key: ASCII letters and digits
# pass through unchanged, accented letters collapse to their base letter,
# quote-like characters are dropped, and punctuation becomes a space.
alphanum = [u'a', u'b', u'c', u'd', u'e', u'f', u'g', u'h', u'i', u'j', u'k', u'l', u'm', u'n', u'o', u'p', u'q', u'r', u's', u't', u'u', u'v', u'w', u'x', u'y', u'z', u'0', u'1', u'2', u'3', u'4', u'5', u'6', u'7', u'8', u'9']
aList = [u'@', u'à', u'À', u'â', u'Â', u'ä', u'Ä']
cList = [u'ç']
eList = [u'€', u'é', u'É', u'è', u'È', u'ê', u'Ê', u'ë', u'Ë']
iList = [u'ï', u'Ï', u'î', u'Î']
nList = [u'ñ']
oList = [u'ô', u'Ô']
uList = [u'ù', u'Ù', u'û', u'Û', u'ü', u'Ü']
# Fixed: \`, \" and \/ were invalid string-escape sequences (these are
# plain lists, not regex patterns); written plainly with the same values.
quoteList = [u'’', u"'", u'`', u'"', u'\\', u'/']
# Fixed: u'=' appeared twice in the original list; membership tests are
# unchanged by dropping the duplicate.
dotList = [u' ', u'.', u'&', u'~', u'{', u'(', u'[', u'-', u'|', u'_', u'^', u')', u']', u'=', u'°', u'+', u'}', u'£', u'$', u'¤', u'%', u'µ', u'*', u'?', u',', u';', u':', u'§', u'!', u'<', u'>']

# Patterns are pre-compiled once at import time. Raw strings replace the
# original invalid escape sequences (\<, \!, \{, \|, \=, \s); the regexes
# match exactly the same text as before.
commentCompiler = re.compile(r'<!--(.*?)-->', re.DOTALL | re.MULTILINE)
# A whole {{fr-conj-1|...}} template occurrence.
templateCompiler = re.compile(r'\{\{' + templateName + r' *\r?\n?\| *.*?\}\}', re.DOTALL)
# Same shape, but capturing the parameter list after the first pipe.
flexionCompiler = re.compile(r'\{\{' + templateName + r' *\r?\n?\| *(.*?)\}\}', re.DOTALL)
# Loose matcher for any {{fr-conj-1...}} span (kept for compatibility).
flexionParser = re.compile(r'\{\{' + templateName + r'.*?\}\}', re.DOTALL)
splitCompiler = re.compile(r'\|')                      # split parameter list on pipes
parameterCompiler = re.compile(r'^(.*?) *= *(.*?)$')   # "name = value" parameter
spacesCompiler = re.compile(r'\s+')                    # collapse whitespace runs


class KeyBot:
  """Normalize the sort-key ('cat') parameter of {{fr-conj-1}} templates
  on French conjugation annex pages.

  For each generated page the bot rebuilds the expected sort key from the
  page title (accents collapsed to base letters, quote-like characters
  dropped, punctuation turned into spaces) and compares it with the
  template's current 'cat' value.  Pages whose key differs are logged to
  the module-level ``articlesToChangeFile``; words containing characters
  the key builder does not recognize are logged to ``outputFile``.
  Nothing is uploaded by this class itself.
  """

  def __init__(self, generator, acceptall = False):
    # generator: iterable of wikipedia.Page objects to process.
    # acceptall: accepted for interface compatibility; run() does not use it.
    self.generator = generator
    self.acceptall = acceptall

  def run(self):
    for page in self.generator:
      try:
        hasBadKey = False
        hasStrangeParameter = False
        # Recover the bare verb by stripping the annex heading from the
        # title: shorten from the left until heading + wordBase matches.
        wordBase = page.title()
        while len(wordBase) > 0 and heading + wordBase != page.title():
          wordBase = wordBase[1:]

        wikipedia.output(u'page: %s' % page.title())
        thePage = page.get()
        theChangedPage = thePage  # same edits as newText, but comments kept
        # Work on a copy with <!-- --> comments removed so commented-out
        # templates are not rewritten.
        oldText = commentCompiler.sub(u'', thePage)
        newText = oldText
        templateList = templateCompiler.findall(newText)
        for oldTemplate in templateList:
          wikipedia.output(u'Model trouvé :\n%s' % oldTemplate)
          flexion = flexionCompiler.search(oldTemplate)
          parameterList = splitCompiler.split(flexion.group(1))
          # First positional parameter is the verb itself.
          word = parameterList[0].strip(u'\r\n ')
          parameterList.pop(0)
          # Skip one empty positional slot if present ({{...|verb||...}}).
          if len(parameterList) > 0:
            nothing = parameterList[0].strip(u'\r\n ')
            if nothing == u'':
              parameterList.pop(0)

          # Parse the remaining name=value parameters, preserving their
          # order in parameterIndex; anything that is not name=value is
          # only flagged (the flag is currently not acted upon).
          parameters = {}
          parameterIndex = []
          for parameter in parameterList:
            parameter = parameter.strip(u'\r\n ')
            parameterElmnt = parameterCompiler.search(parameter)
            if parameterElmnt:
              parameterIndex.append(parameterElmnt.group(1))
              parameters[parameterElmnt.group(1)] = parameterElmnt.group(2)
            else:
              hasStrangeParameter = True

          # Ensure a 'cat' entry exists so the comparison below is safe.
          if u'cat' not in parameters:
            parameters[u'cat'] = wordBase
            parameterIndex.append(u'cat')

          # Build the sort key from the verb, one character at a time.
          theKey = u''
          for character in wordBase.lower():
            if character in alphanum:
              theKey += character
            elif character in aList:
              theKey += u'a'
            elif character in cList:
              theKey += u'c'
            elif character in eList:
              theKey += u'e'
            elif character in iList:
              theKey += u'i'
            elif character in nList:
              theKey += u'n'
            elif character in oList:
              theKey += u'o'
            elif character in uList:
              theKey += u'u'
            elif character == u'æ':
              theKey += u'ae'
            elif character == u'œ':
              theKey += u'oe'
            elif character in quoteList:
              pass  # quote-like characters are simply dropped
            elif character in dotList:
              theKey += u' '
            else:
              # Unknown character: log the word for manual review and
              # fall back to a space.
              theTitle = wordBase
              encodedTitle = theTitle.encode('utf-8')
              outputFile.write(encodedTitle)
              outputFile.write("\r\n")
              theKey += u' '

          # Trim and collapse whitespace runs to single spaces.
          theKey = theKey.strip()
          theKey = spacesCompiler.sub(u' ', theKey)
          if parameters[u'cat'] != theKey:
            wikipedia.output(u'Cle de tri : %s' % wordBase)

            if theKey == wordBase:
              # Key equals the verb itself: the explicit parameter is
              # redundant, so drop it.
              if u'cat' in parameters:
                parameterIndex.remove(u'cat')
              parameters.pop(u'cat')
            else:
              if u'cat' not in parameters:
                parameterIndex.append(u'cat')
              parameters[u'cat'] = theKey

            otherParameters = u''
            for key in parameterIndex:
              wikipedia.output(u'key: %s' % key)
              otherParameters += u'|%s=%s\n' % (key, parameters[key])

            newTemplate = u'{{%s|%s|\n%s}}' % (templateName, word, otherParameters)
            # Fixed: the original used re.sub(re.escape(oldTemplate), newTemplate, ...),
            # which interprets newTemplate as a regex *replacement* string —
            # backslashes or \g<...> sequences in page text would be
            # misinterpreted or raise. Plain string replacement is exact.
            newText = newText.replace(oldTemplate, newTemplate, 1)
            theChangedPage = theChangedPage.replace(oldTemplate, newTemplate, 1)
            hasBadKey = True

        # Record the page title; the actual upload is done elsewhere.
        if newText != oldText and hasBadKey:
          wikipedia.output(u'MISE A JOUR : %s' % wordBase)
          theTitle = wordBase
          encodedTitle = theTitle.encode('utf-8')
          articlesToChangeFile.write(encodedTitle)
          articlesToChangeFile.write("\r\n")

      except wikipedia.NoPage:
          wikipedia.output(u'Page %s does not exist?!?!' % page.aslink())
      except wikipedia.IsRedirectPage:
          pass
      except wikipedia.LockedPage:
          pass

def main():
    """Build a page generator from the command-line arguments and run KeyBot."""
    generator = None
    titleParts = []
    for arg in wikipedia.handleArgs():
        if not arg:
            continue
        if arg.startswith('-start:'):
            generator = pagegenerators.AllpagesPageGenerator(arg[7:])
        elif arg.startswith('-ref:'):
            target = wikipedia.Page(wikipedia.getSite(), arg[5:])
            generator = pagegenerators.ReferringPageGenerator(target)
        elif arg.startswith('-links:'):
            source = wikipedia.Page(wikipedia.getSite(), arg[7:])
            generator = pagegenerators.LinkedPageGenerator(source)
        elif arg.startswith('-file:'):
            generator = pagegenerators.TextfilePageGenerator(arg[6:])
        elif arg.startswith('-cat:'):
            # NOTE(review): the category name after "-cat:" is ignored;
            # category titles are read from ./cat.txt instead, and only
            # the LAST one listed ends up being used — confirm this is
            # intentional before changing it.
            listedCats = pagegenerators.TextfilePageGenerator('./cat.txt')
            for catPage in pagegenerators.PreloadingGenerator(listedCats):
                cat = catlib.Category(wikipedia.getSite(), catPage.title())
            generator = pagegenerators.CategorizedPageGenerator(cat)
        else:
            # Anything else is treated as (part of) a single page title.
            titleParts.append(arg)

    if titleParts:
        single = wikipedia.Page(wikipedia.getSite(), ' '.join(titleParts))
        generator = iter([single])
    if not generator:
        wikipedia.showHelp('touch')
    else:
        KeyBot(pagegenerators.PreloadingGenerator(generator)).run()

if __name__ == "__main__":

    # Open the two timestamped log files that KeyBot.run() writes to:
    # - strangeArticles-*: words containing characters the key builder
    #   does not recognize;
    # - articlesToChange-*: pages whose sort key needs updating.
    now = time.localtime()
    stamp = '%s-%s-%s' % (now.tm_hour, now.tm_min, now.tm_sec)
    outputFile = open('./strangeArticles-' + stamp + '.txt', 'w')
    articlesToChangeFile = open('./articlesToChange-' + stamp + '.txt', 'w')

    try:
        main()
    finally:
        # Fixed: the original closed both files AFTER the try/finally,
        # so any exception escaping main() leaked the two handles.
        wikipedia.stopme()
        outputFile.close()
        articlesToChangeFile.close()