Utilisateur:Jona/process missing links.py

# -*- coding: utf-8 -*-
import sys, re, getopt

#TODO: add an interactive mode
#TODO: make a more formal usage output and options manager

def extract_links(articles):
    """Extract the targets of the wiki links found in a text file.

    Two kinds of links are collected, line by line:

    * internal links such as [[foo]] or [[foo|text printed]]; a link
      whose content contains ":" or "#" is ignored;
    * fr.wikt translation templates, {{trad|lan|link}} and
      {{trad+|lan|link}}.

    Only the target is kept: for both kinds, everything from the first
    "|" following the target is dropped, so [[link|text printed]] gives
    "link" and {{trad|lan|link|extra}} gives "link".

    articles -- path of the file to read

    Return a list of targets in file order (for each line, internal
    links first, then translations); duplicates are kept."""

    # Raw strings avoid invalid escape sequences; compiled once, outside
    # the loop over the lines.
    link_re = re.compile(r'\[\[([^:#]*?)\]\]')
    trad_re = re.compile(r'\{\{trad\+?\|[^|]+?\|(.*?)\}\}')

    llink = []
    # "with" closes the file even if reading fails part-way through.
    with open(articles, 'r') as fin:
        for line in fin:
            for pattern in (link_re, trad_re):
                for content in pattern.findall(line):
                    # Keep what precedes the first "|": the printed text
                    # of a link, or extra template parameters, is not
                    # part of the target.
                    llink.append(content.partition('|')[0])
    return llink

def compute_occ(extendedl):
    """Count how many times each item appears in extendedl.

    Return a dict mapping each distinct item to its number of
    occurrences"""
    counts = {}
    for item in extendedl:
        if item in counts:
            counts[item] += 1
        else:
            counts[item] = 1
    return counts

def file_to_list(nameFile):
    """Read a text file into a list of lines.

    nameFile -- path of the file to read

    Return a list with one string per line. Newline characters are
    stripped from both ends of each line; any other whitespace is kept."""
    # "with" closes the file even if reading fails part-way through.
    with open(nameFile, 'r') as f:
        return [line.strip('\n') for line in f]

def list_to_file(l,nameFile):
    """Print a list to a file, one item per line.

    A tuple item is written as its first two elements separated by a
    tab; a string item is written as is. Any other item is skipped and
    a message is printed instead.

    l -- list to write (a warning is printed if it is not a list)
    nameFile -- path of the file to write; it is overwritten

    For list not used in this module, the result can be unexpected"""
    if not isinstance(l, list):
        print("Warning: The argument is not a list (list_to_file())")
        print("Unexpected behavior can occur")
    # "with" closes the file even if an item cannot be written
    # (for example a tuple with fewer than two elements).
    with open(nameFile, 'w') as f:
        for s in l:
            if isinstance(s, tuple):
                f.write(str(s[0]) + "\t" + str(s[1]) + '\n')
            elif isinstance(s, str):
                f.write(s + '\n')
            else:
                print("This format is not supported")

def dict_to_file(d,nameFile):
    """Print a dict to a file, one "key value" pair per line.

    d -- dict to write (a warning is printed if it is not a dict)
    nameFile -- path of the file to write; it is overwritten

    Values are converted to text, so the occurrence dicts built by
    compute_occ() (integer values) can be written.

    For dict not used in this module, the result can be unexpected"""
    if not isinstance(d, dict):
        print("Warning: The argument is not a dict (dict_to_file())")
        print("Unexpected behavior can occur")
    # "with" closes the file even if a write fails.
    with open(nameFile, 'w') as f:
        for key in d:
            # %s formatting accepts non-string values, where plain
            # concatenation raised TypeError for integer counts.
            f.write('%s %s\n' % (key, d[key]))

def make_diff(l1,l2):
    """Collect the items of l2 that are absent from l1 (l2 - l1).

    The difference is computed on sets, so duplicates are collapsed and
    the order of the result is unspecified.

    Return a list"""
    known = set(l1)
    candidates = set(l2)
    return list(candidates - known)

def retrieve_occ(l,occ):
    """Pair every item of l with its number of occurrences.

    The counts are looked up in the dict occ; an item missing from occ
    gets 0.

    Return a list of tuple (occurrence, item), in the order of l"""
    return [(occ.get(item, 0), item) for item in l]

def format_and_write(l,outf):
    """Format the list of tuple and write it to a file.

    Each tuple (number, link) becomes one wiki list item: a link to the
    page followed by a link to its Special:Whatlinkshere page, labelled
    with the number.

    l -- list of tuple (number, link); number must be an integer
    outf -- path of the file to write; it is overwritten"""
    item_format = '* [[%(link)s]] ([[Special:Whatlinkshere/%(link)s|%(number)d]])\n'
    # "with" closes the file even if formatting an entry fails
    # (for example when the number is not an integer).
    with open(outf, 'w') as fFinal:
        for line in l:
            fFinal.write(item_format % {'link': line[1], 'number': line[0]})

def usage():
    """Write the list of available command-line options to stderr."""
    # One entry per output line; tabs are spelled "\t" so that the
    # column separators are visible in the source.
    help_lines = [
        "Options available are\n",
        "-h --help\t",
        "-v --verbose\tEnter verbose mode",
        "-i --input\tSpecify an input directory",
        "-o --output\tSpecify an output filename",
        "-d\t\t(nothing changing)\n",
    ]
    sys.stderr.write("\n".join(help_lines))


def main(argv):
    """Find the missing links of a wiki dump and write them as wiki text.

    The links are extracted from the file "out" of the input directory
    and compared with the titles listed, one per line, in the file
    "outTitle" of the same directory. Every link that is not a title is
    written to the output file, most frequently used first.

    argv -- command-line arguments, without the program name:
        -h --help      print the options and exit with status 0
        -v --verbose   print progress messages
        -i --input     input directory (default ".")
        -o --output    output filename (default "final")
        -d             set the _debug global (no other effect yet)

    An unknown option prints an error and the options, then exits with
    status 2. Sets the module globals _verbose and _debug."""

    global _verbose, _debug
    _verbose = 0
    # Always defined, like _verbose, so that code testing _debug does
    # not fail with NameError when -d was not given.
    _debug = 0

    inf = '/out'
    titlef = '/outTitle'
    inputdir = "."
    outf = 'final'

    def report(message):
        # Progress messages are shown in verbose mode only.
        if _verbose:
            print(message)

    try:
        opts, args = getopt.getopt(argv, "hvi:o:d", ["help", "verbose", "input=", "output="])
    except getopt.GetoptError:
        sys.stderr.write("Illegal argument\n")
        usage()
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit(0)
        elif opt == '-d':
            _debug = 1
            #TODO: put debug condition
        elif opt in ("-v", "--verbose"):
            _verbose = 1
            #TODO: put verbose condition
        elif opt in ("-o", "--output"):
            #TODO: verify that arg is a usable file path
            outf = arg
        elif opt in ("-i", "--input"):
            #TODO: verify that arg is a directory path
            inputdir = arg

    report('Extracting links from "%s%s"...'% (inputdir, inf))
    llink = extract_links(inputdir+inf)
    report("Links extracted...")

    linksOcc = compute_occ(llink)
    report("Occurences computed...")

    # Read Title list
    report('Reading title from "%s%s"...'% (inputdir, titlef))
    lt = file_to_list(inputdir+titlef)
    report("Title list read...")

    # Missing links are the links that are not existing titles
    ldiff = make_diff(lt,linksOcc.keys())
    report("Missing links found...")

    mlocc = retrieve_occ(ldiff,linksOcc)
    report("Occurences of missing links added...")

    # Sort the result: the tuples start with the count, so the most
    # frequently used missing links come first
    lOccSorted = sorted(mlocc,reverse=True)

    format_and_write(lOccSorted, outf)
    report('Output written in "%s"'% outf)

# Script entry point: run main() with the command-line arguments,
# without the program name.
if __name__ == '__main__':
    main(sys.argv[1:])

    # Uncomment the lines below to estimate the running time with timeit
    ##import timeit
    ##t = timeit.Timer("main(sys.argv[1:])", "from __main__ import main")
    ##print t.repeat(3,5)