# Utilisateur:PamputtBot/anagr.py

#!/usr/bin/env python
# coding: utf-8
# Ce script identifie les anagrammes depuis une liste de mots
# python3 pwb.py anagr_langue.py CODE_LANGUE
# python3 pwb.py anagr_langue.py CODE_LANGUE accueil

import os, sys

from unidecode import unidecode
import re
import pywikibot

# Dry-run switch read by sauvegarde(): when True the new page text is only
# printed through pywikibot.output() instead of being saved to the wiki.
test = False
# Directory prefix (with trailing slash) prepended to every work file this
# script reads or writes: the input word list, the per-language output file
# and the "a_verifier" report files.
pwd_path="/data/project/wiktionnaire-pamputt/anagramme/"

def enlever_diacritiques(mot):
    """Return *mot* lower-cased, transliterated and stripped of punctuation.

    Each character is lower-cased and passed through ``unidecode``; the
    punctuation and spacing characters listed below are then dropped so that
    only the characters relevant for anagram comparison remain.

    NOTE: this may need adapting for languages in which a letter carrying a
    diacritic is a letter in its own right (e.g. Vietnamese treats "a" and
    "â" as two different letters).
    """
    translitteration = "".join(unidecode(caractere.lower()) for caractere in mot)

    # Characters that do not count when comparing two words. Replacing a
    # character that is absent is a no-op, so no prior membership test is
    # needed.
    symboles_ignores = ("(", ")", "-", "'", "’", ".", ",", "/", " ", "_", ":", ";")
    for symbole in symboles_ignores:
        translitteration = translitteration.replace(symbole, "")

    return translitteration


def creerAlphagramme(titre):
    """Return the alphagram of *titre*.

    The title is first normalised by ``enlever_diacritiques``; the remaining
    characters are sorted and joined, and surrounding whitespace is stripped.
    Two titles are anagrams of each other when their alphagrams are equal.
    """
    lettres_triees = sorted(enlever_diacritiques(titre))
    return "".join(lettres_triees).strip()

def _mots_des_liens_wiki(ligne):
    """Return the word of every ``[[...]]`` link found on *ligne*."""
    mots = []
    debut = ligne.find("[[")
    while debut != -1:
        fin = ligne.find("]]", debut)
        if fin == -1:
            # Unterminated link: nothing reliable can be extracted.
            break
        interieur = ligne[debut + 2:fin]
        # For piped links such as [[école#fr|école]] keep the displayed label.
        _, barre, libelle = interieur.partition("|")
        mots.append(libelle if barre else interieur)
        # Look for further links on the same line.
        debut = ligne.find("[[", fin)
    return mots


def _mots_des_modeles_lien(ligne):
    """Return the word of every ``{{lien|mot}}`` / ``{{lien|mot|xx}}`` on *ligne*."""
    marqueur = "{{lien|"
    mots = []
    debut = ligne.find(marqueur)
    while debut != -1:
        depart = debut + len(marqueur)
        fin = ligne.find("}}", depart)
        if fin == -1:
            # Unterminated template: nothing reliable can be extracted.
            break
        # The word is the first parameter; the search for "|" is bounded by
        # the template's own closing braces so that a later template on the
        # same line cannot be swallowed.
        mots.append(ligne[depart:fin].split("|", 1)[0])
        debut = ligne.find(marqueur, fin)
    return mots


def analyse_section_anagrammes(contenu, anagrammes, alphagramme):
    """Collect the anagrams already listed in an existing "anagrammes" section.

    Args:
        contenu: wikitext of the section, one entry per line.
        anagrammes: list of already known anagrams; it is extended IN PLACE
            with the words found in the section that are not yet in it.
        alphagramme: alphagram every accepted word must have.

    Returns:
        A tuple ``(anagrammes, voirAnagrammes)``. ``voirAnagrammes`` is True
        when the section already contains ``{{voir anagrammes|``; in that case
        the function returns immediately and ``anagrammes`` is left untouched.

    Links are read either as ``[[mot]]`` / ``[[cible|mot]]`` or, for lines
    without any ``[[``, as ``{{lien|mot}}`` / ``{{lien|mot|CODE_LANGUE}}``.
    Only words whose alphagram equals *alphagramme* are kept.
    """
    mots_presents = []
    for ligne in contenu.splitlines():
        if "{{voir anagrammes|" in ligne:
            return anagrammes, True

        # Wiki links take precedence; the template form is only looked for
        # on lines that contain no [[...]] link at all.
        if "[[" in ligne:
            candidats = _mots_des_liens_wiki(ligne)
        else:
            candidats = _mots_des_modeles_lien(ligne)

        for mot_present in candidats:
            if creerAlphagramme(mot_present) == alphagramme:
                mots_presents.append(mot_present)

    for mot in mots_presents:
        if mot not in anagrammes:
            anagrammes.append(mot)

    return anagrammes, False


# Wiki modification
def _noter_dans_fichier(nom_fichier, texte):
    """Append *texte* to the report file *nom_fichier* stored under pwd_path."""
    with open(pwd_path + nom_fichier, "a") as fichier:
        fichier.write(texte)


def _quitter_si_mot_unique(est_mot_a_traiter):
    """Terminate the script when the failing page is the single requested word."""
    if est_mot_a_traiter:
        sys.exit()


def modification(alphagramme, anagrammes, code_langue_a_traiter="fr", mot_a_traiter=""):
    """Rewrite the "anagrammes" section of every page of an anagram group.

    For each title in *anagrammes* the page is loaded, its existing anagram
    section (inside the section of the requested language) is replaced by
    ``{{voir anagrammes|CODE}}``, or such a section is inserted when missing,
    and the shared template page is refreshed via ``creer_page_anagrammes``.

    Args:
        alphagramme: alphagram shared by all words of the group.
        anagrammes: list of page titles. It may grow while the function
            runs: ``analyse_section_anagrammes`` and ``creer_page_anagrammes``
            append words to this very list, and the appended words are then
            visited by the loop as well.
        code_langue_a_traiter: language code of the section to edit.
        mot_a_traiter: when non-empty, only this title is processed and the
            script terminates (``SystemExit``) once it has been handled or
            has failed.

    Returns:
        None. When a page cannot be processed (missing, wrong namespace,
        unreadable, no section in the language) the problem is logged and
        the rest of the group is abandoned.
    """
    nouvelle_section_anagramme = "=== {{S|anagrammes}} ===\n"
    nouvelle_section_anagramme += "{{voir anagrammes|" + code_langue_a_traiter + "}}"
    language_header = '{{langue|' + code_langue_a_traiter + '}}'
    journal = "anagrammes_recuperees_a_verifier.txt"

    est_mot_a_traiter = False

    for titre in anagrammes:
        print('Traitement de ' + titre + '!')

        if len(mot_a_traiter) > 0:
            if titre != mot_a_traiter:
                continue
            est_mot_a_traiter = True

        try:
            page = pywikibot.Page(pywikibot.Site(), titre)
        except UnicodeDecodeError as e:
            print(str(e))
            _quitter_si_mot_unique(est_mot_a_traiter)
            return

        if not page.exists():
            print(f"{titre} n'existe pas (encore)")
            _noter_dans_fichier(journal, titre + " n'existe pas\n")
            _quitter_si_mot_unique(est_mot_a_traiter)
            return

        if page.namespace() != 0:
            print(f"{titre} n'est pas dans le bon espace de nom")
            _noter_dans_fichier(journal, titre + " n'est pas dans l'espace de nom principal\n")
            _quitter_si_mot_unique(est_mot_a_traiter)
            return

        try:
            PageBegin = page.get()
        except (pywikibot.exceptions.NoPageError,
                pywikibot.exceptions.LockedPageError,
                pywikibot.exceptions.IsRedirectPageError) as e:
            print(str(e))
            _noter_dans_fichier(journal, titre + ";" + str(e) + "\n")
            _quitter_si_mot_unique(est_mot_a_traiter)
            return

        # The page has no section in the requested language: give up.
        if language_header not in PageBegin:
            print("pas de section " + code_langue_a_traiter +" !!!")
            _noter_dans_fichier(journal, titre + ";pas de section " + code_langue_a_traiter + "\n")
            _quitter_si_mot_unique(est_mot_a_traiter)
            return

        PageEnd = ''
        sectionLangueATraiter = False   # currently inside the requested language section
        anagrammesSection = False       # an anagram section has been met in that language
        apresSection = False            # a marker that must follow the anagram section was met
        voirAnagrammes = False          # the existing section already uses {{voir anagrammes|
        sectionAjoutee = False          # the new/kept anagram section was written to PageEnd
        contenuAnagramme = ""           # raw text of the existing anagram section
        a_verif_ajoute = False          # the page was already reported for manual check

        for ligne in PageBegin.splitlines():
            # Another language section starts while we were inside ours:
            # our section is over.
            if ('{{langue|' in ligne and
                    '==' in ligne and
                    sectionLangueATraiter):
                sectionLangueATraiter = False
                apresSection = True

            if language_header in ligne:
                sectionLangueATraiter = True

            if sectionLangueATraiter:
                if "{{S|anagr" in ligne:
                    anagrammesSection = True

                # "voir"/"références" sections, categories and sort keys
                # all come after the anagram section.
                if ('{{S|voir' in ligne or
                        '{{S|réf' in ligne or
                        '[[Catégorie:' in ligne or
                        '{{clé de tri' in ligne):
                    apresSection = True

            if anagrammesSection and not apresSection:
                contenuAnagramme += ligne + "\n"

            # The new content is built from the existing one, except for the
            # anagram section, which is emptied and replaced by
            # {{voir anagrammes|CODE_LANGUE}}.
            if not anagrammesSection:
                # No anagram section was found but a following marker was:
                # insert the new section right before the current line.
                if apresSection and not sectionAjoutee:
                    sectionAjoutee = True
                    PageEnd += nouvelle_section_anagramme + "\n"
                # Avoid adding a superfluous newline at the end of the page.
                if apresSection:
                    PageEnd += "\n" + ligne
                else:
                    PageEnd += ligne + "\n"
            elif apresSection:
                # The anagram section is complete: analyse it once, so that
                # words it lists but that were not found in the dump
                # (red links) are added to the anagram list.
                if not sectionAjoutee:
                    anagrammes, voirAnagrammes = \
                        analyse_section_anagrammes(contenuAnagramme, anagrammes, alphagramme)
                    sectionAjoutee = True

                    # An existing {{voir anagrammes}} is kept as is, since
                    # it may carry manual settings.
                    if voirAnagrammes:
                        PageEnd += contenuAnagramme[:-1]  # drop the last newline
                    else:
                        PageEnd += nouvelle_section_anagramme + "\n"
                PageEnd += "\n" + ligne
            else:
                # Still inside the anagram section: its lines are only
                # accumulated in contenuAnagramme. Another heading here means
                # a (probably misplaced) section follows; the page must then
                # be handled by hand.
                if ("==" in ligne and
                        "{{S|anagrammes}}" not in ligne and
                        not a_verif_ajoute):
                    print(f"a verif: {ligne}")
                    _noter_dans_fichier("a_verifier.txt", titre + "\n")
                    a_verif_ajoute = True
                continue

        # End of page reached without having written the anagram section.
        if not sectionAjoutee:
            anagrammes, voirAnagrammes = analyse_section_anagrammes(contenuAnagramme, anagrammes, alphagramme)
            sectionAjoutee = True
            # When the section is newly created, add a newline so that it is
            # not glued to the previous section.
            if not anagrammesSection:
                PageEnd += "\n"

            if voirAnagrammes:
                PageEnd += contenuAnagramme[:-1]  # drop the last newline
            else:
                PageEnd += nouvelle_section_anagramme

        # Sanity check: the result must not be shorter than the original
        # minus the old section plus the new one.
        if len(PageEnd) < (len(PageBegin) - len(contenuAnagramme) + len(nouvelle_section_anagramme)):
            print("Suppression trop importante de texte. Il y a peut-être un problème.")
            _noter_dans_fichier("a_verifier.txt", titre + "\n")
            _quitter_si_mot_unique(est_mot_a_traiter)
            continue

        creer_page_anagrammes(alphagramme, anagrammes, code_langue_a_traiter)
        if PageEnd != PageBegin:
            sauvegarde(page, PageEnd)

        # In single-word mode the job is done once that word is saved.
        _quitter_si_mot_unique(est_mot_a_traiter)


def analyse_modele_anagrammes(contenu, code_langue_a_traiter="fr"):
    """Return the words listed in an ``{{anagrammes|...}}`` template call.

    Args:
        contenu: wikitext to scan. When several lines contain the template,
            the last one wins.
        code_langue_a_traiter: language code; the ``lang=CODE`` parameter is
            removed from the result.

    Returns:
        The list of template parameters (split on ``|``) without
        ``lang=CODE``; an empty list when the template is absent.
    """
    marqueur = "{{anagrammes|"
    mots_trouves = []
    for ligne in contenu.splitlines():
        debut = ligne.find(marqueur)
        if debut == -1:
            continue
        # Parameters start right after the marker and end at the template's
        # own closing braces, regardless of what precedes it on the line.
        depart = debut + len(marqueur)
        fin = ligne.find("}}", depart)
        if fin == -1:
            # Unterminated template: read up to the end of the line.
            fin = len(ligne)
        mots_trouves = ligne[depart:fin].split("|")

    parametre_langue = "lang=" + code_langue_a_traiter
    return [mot for mot in mots_trouves if mot != parametre_langue]

def verif_mots_sont_alphagrammes(mots_trouves, alphagramme):
    """Report every word whose alphagram differs from *alphagramme*.

    One line per offending word is appended to
    ``anagrammes_recuperees_a_verifier.txt`` under ``pwd_path``. The file is
    opened (and therefore created) even when there is nothing to report.

    Args:
        mots_trouves: words to check.
        alphagramme: alphagram every word is expected to have.
    """
    # The context manager guarantees the file is closed even if
    # creerAlphagramme() raises.
    with open(pwd_path + "anagrammes_recuperees_a_verifier.txt", "a") as fichier:
        for mot in mots_trouves:
            if creerAlphagramme(mot) != alphagramme:
                fichier.write(f"{mot} n'est pas un alphagramme de {alphagramme}\n")

def creer_page_anagrammes(alphagramme, mots, code_langue_a_traiter="fr"):
    """Create or update the template page listing the anagrams of a group.

    The page is ``Modèle:anagrammes/CODE/ALPHAGRAMME`` and its content is a
    single ``{{anagrammes|lang=CODE|mot1|mot2|...}}`` call with the words
    sorted by their normalised form.

    Args:
        alphagramme: alphagram of the group (last component of the title).
        mots: words of the group. Words already present on the template page
            but missing from this list are appended to it IN PLACE.
        code_langue_a_traiter: language code.

    Returns:
        None. Nothing is saved when the page cannot be read or when its
        content is already up to date.
    """
    titre = "Modèle:anagrammes/" + code_langue_a_traiter + "/" + alphagramme

    try:
        page = pywikibot.Page(pywikibot.Site(), titre)
    except UnicodeDecodeError as e:
        print(str(e))
        return

    if page.exists() and page.namespace() == 10:  # 10: template namespace
        try:
            PageBegin = page.get()
        except (pywikibot.exceptions.NoPageError,
                pywikibot.exceptions.LockedPageError,
                pywikibot.exceptions.IsRedirectPageError) as e:
            print(str(e))
            return
    else:
        print("Pas encore de page")
        PageBegin = ""

    mots_deja_presents = analyse_modele_anagrammes(PageBegin, code_langue_a_traiter)

    # Words already on the page that do not have the expected alphagram are
    # reported in a file.
    verif_mots_sont_alphagrammes(mots_deja_presents, alphagramme)

    # A word present on the page but not found in the dump would reveal a bug
    # in this program or in anagr.cpp; it is kept in the list nonetheless.
    for mot in mots_deja_presents:
        if mot not in mots:
            print(f"{mot} n'a pas été trouvé dans le dump")
            mots.append(mot)

    # Alphabetical order on the normalised form, the raw word breaking ties
    # (https://stackoverflow.com/a/16581759).
    mots_tries = sorted(mots, key=lambda mot: (enlever_diacritiques(mot), mot))

    # Future content of the page.
    PageEnd = "|".join(["{{anagrammes|lang=" + code_langue_a_traiter] + mots_tries) + "}}"

    # Save only when something changed.
    if PageEnd != PageBegin:
        sauvegarde(page, PageEnd)


# Reading of the liste_mot_par_langue.txt file
def crawlerFile(source, code_langue_a_traiter="fr", mot_a_traiter=""):
    """Group the words of one language by alphagram and update the wiki.

    *source* is read line by line; each line is expected to look like
    ``titre; CODE_LANGUE``. Reading stops at the first line without ``;``.
    Words of the requested language are grouped by alphagram; every group
    holding at least two words that differ once normalised is written to
    ``<pwd_path><CODE>.txt`` and passed to ``modification``.

    Args:
        source: path of the word list; a falsy value makes the function
            return immediately.
        code_langue_a_traiter: language code to process.
        mot_a_traiter: forwarded to ``modification`` (single-word mode).

    Returns:
        None.
    """
    if not source:
        return

    non_ascii = re.compile(r'[^\x00-\x7f]')
    alphagrammes = {}  # alphagram -> list of titles

    # Read with the same encoding the output file is written with.
    with open(source, "r", encoding="utf8") as liste_mots:
        for ligne in liste_mots:
            titre, separateur, reste = ligne.partition(";")
            if not separateur:
                # A line without ';' (or a blank line) ends the list.
                break

            titre = titre.strip()
            # The separator is two characters wide ("; "): skip its second
            # character, then drop the end-of-line (if any).
            codeLangue = reste[1:].strip()

            if codeLangue != code_langue_a_traiter:
                continue

            # Empty titles, prefixes and suffixes are ignored.
            if not titre or titre.startswith('-') or titre.endswith('-'):
                continue

            alphagramme = creerAlphagramme(titre)
            # Only alphagrams made of ASCII characters are of interest.
            if non_ascii.search(alphagramme):
                continue

            alphagrammes.setdefault(alphagramme, []).append(titre)

    # Save the groups to a text file and update the wiki group by group.
    with open(pwd_path + code_langue_a_traiter +'.txt', 'w', encoding='utf8') as file:
        for key, mots in alphagrammes.items():
            if len(mots) < 2:
                continue

            # Words that only differ by diacritics/punctuation are not
            # anagrams of each other: at least two distinct normalised forms
            # are required.
            formes = {enlever_diacritiques(mot) for mot in mots}
            if len(formes) < 2:
                continue

            for mot in mots:
                # One word per line.
                file.write("%s;\n" % mot)

            modification(key, mots, code_langue_a_traiter, mot_a_traiter)



def sauvegarde(PageCourante, Contenu):
    """Save *Contenu* to *PageCourante*, or only display it in test mode.

    When the module-level ``test`` flag is set, the page title and the new
    content are printed through ``pywikibot.output`` and nothing is written
    to the wiki.
    """
    if test:
        pywikibot.output(f"{PageCourante.title()} :\n{Contenu}")
        return

    PageCourante.put(Contenu, summary="nouveau système de gestion des anagrammes")


                
# Launch
# Validate the command line BEFORE touching the work files, so that a wrong
# invocation neither destroys the reports of the previous run nor goes on to
# fail on sys.argv[1].
if(len(sys.argv) == 1 or
   len(sys.argv) > 3):
    print("python3 pwb.py anagr CODE_LANGUE [mot_à_traiter]")
    sys.exit(1)

# Reset of the work files: the previous "a_verifier" report is kept as a
# ".save" copy, the other report is simply deleted.
if os.path.exists(pwd_path + "a_verifier.txt"):
    os.replace(pwd_path + "a_verifier.txt", pwd_path + "a_verifier.txt.save")

if os.path.exists(pwd_path + "anagrammes_recuperees_a_verifier.txt"):
    os.remove(pwd_path + "anagrammes_recuperees_a_verifier.txt")

code_langue = sys.argv[1].strip()

if(len(sys.argv)>2):
    # Single-word mode: only the given word is processed.
    mot_a_traiter = sys.argv[2].strip()
    TraitementFichier = crawlerFile(pwd_path + 'liste_mot_par_langue.txt', code_langue, mot_a_traiter)
else:
    TraitementFichier = crawlerFile(pwd_path + 'liste_mot_par_langue.txt', code_langue)