# -*- coding: utf-8 -*-
import sys, re, getopt
#TODO: add an interactive mode
#TODO: make a more formal usage output and options manager
def extract_links(articles):
    """Extract the link targets found in a text file.

    Two kinds of markup are looked for on every line:

    * ``[[target]]`` / ``[[target|text printed]]`` links whose content
      holds neither ":" nor "#"; only the part before the first "|"
      is kept.
    * fr.wikt translation templates ``{{trad|lan|link}}`` and
      ``{{trad+|lan|link}}``; the text between the second "|" and the
      closing "}}" is kept unchanged.

    articles -- path of the file to read.

    Return a list of strings, duplicates included.  For each line the
    links come first, followed by the translations.
    """
    # Raw strings keep the backslashes literal; compiling once avoids
    # handing the pattern text to ``re`` again for every line.
    link_re = re.compile(r'\[\[([^\:#]*?)\]\]')
    trad_re = re.compile(r'\{\{trad\+?\|[^\|]+?\|(.*?)\}\}')
    llink = []
    # ``with`` closes the file even when reading fails part-way.
    with open(articles, 'r') as fin:
        for line in fin:
            for s in link_re.findall(line):
                # Keep the target only, not the text printed after "|".
                llink.append(s.split('|', 1)[0])
            llink.extend(trad_re.findall(line))
    return llink
def compute_occ(extendedl):
    """Count how many times each item appears in extendedl
    Return a dict mapping each distinct item to its number of occurrences"""
    counts = {}
    for item in extendedl:
        if item in counts:
            counts[item] += 1
        else:
            # First time this item is seen.
            counts[item] = 1
    return counts
def file_to_list(nameFile):
    """Read a text file into a list, one entry per line.

    nameFile -- path of the file to read.

    Return the lines in file order, with the newline characters
    stripped; an empty line becomes an empty string.
    """
    # ``with`` closes the file even when reading fails part-way.
    with open(nameFile, 'r') as f:
        return [line.strip('\n') for line in f]
def list_to_file(l, nameFile):
    """Print a list to a file, one item per line.

    l -- the items to write.  A tuple is written as its first two
         elements separated by a tab, a str is written as it is.  Any
         other item is skipped with a message on standard output.
    nameFile -- path of the file to write (its content is replaced).

    A non-list argument only triggers a warning: it is still iterated,
    so for lists not used in this module the result can be unexpected.
    """
    # isinstance() also accepts subclasses (e.g. namedtuple items).
    if not isinstance(l, list):
        print("Warning: The argument is not a list (list_to_file())")
        print("Unexpected behavior can occur")
    # ``with`` closes the file even when a write or the iteration fails.
    with open(nameFile, 'w') as f:
        for s in l:
            if isinstance(s, tuple):
                f.write(str(s[0]) + "\t" + str(s[1]) + '\n')
            elif isinstance(s, str):
                f.write(s + '\n')
            else:
                print("This format is not supported")
def dict_to_file(d, nameFile):
    """Print a dict to a file, one "key value" pair per line.

    d -- the mapping to write; key and value are separated by a space.
    nameFile -- path of the file to write (its content is replaced).

    A non-dict argument only triggers a warning: it is still iterated,
    so for dicts not used in this module the result can be unexpected.
    """
    if not isinstance(d, dict):
        print("Warning: The argument is not a dict (dict_to_file())")
        print("Unexpected behavior can occur")
    # ``with`` closes the file even when a write fails.
    with open(nameFile, 'w') as f:
        for s in d:
            # %s formatting accepts non-string values such as the integer
            # counts built by compute_occ(); plain "+" concatenation
            # raised TypeError on them.
            f.write('%s %s\n' % (s, d[s]))
def make_diff(l1, l2):
    """Compute the set difference (l2 - l1)
    Return a list of the distinct items of l2 that are absent from l1"""
    excluded = set(l1)
    return list(set(l2) - excluded)
def retrieve_occ(l, occ):
    """Pair each item of a list with its number of occurrences taken from a dict (occ)
    Items missing from occ get 0
    Return a list of tuples (occurrences, item)"""
    return [(occ.get(item, 0), item) for item in l]
def format_and_write(l, outf):
    """Format the list of tuples as wiki list items and write it to a file.

    l -- sequence of (number, link) tuples; number is formatted with %d.
    outf -- path of the file to write (its content is replaced).

    Each tuple becomes one line holding a link to the page and a
    "Special:Whatlinkshere" link labelled with the number.
    """
    template = '* [[%(link)s]] ([[Special:Whatlinkshere/%(link)s|%(number)d]])\n'
    # ``with`` closes the file even when formatting or writing fails
    # (e.g. a number that %d cannot format).
    with open(outf, 'w') as fFinal:
        for line in l:
            fFinal.write(template % {'link': line[1], 'number': line[0]})
def usage():
    """Write the list of available command-line options to standard error"""
    help_text = """Options available are\n
-h --help
-v --verbose Enter verbose mode
-i --input Specify an input directory
-o --output Specify an output filename
-d (nothing changing)\n"""
    sys.stderr.write(help_text)
def main(argv):
    """Run the tool from a list of command-line arguments.

    argv -- the arguments without the program name (see usage()).

    Steps: extract the links from "<inputdir>/out", count them, read
    the title list from "<inputdir>/outTitle", keep the links that are
    not in the title list and write them to the output file, sorted in
    decreasing order of (occurrences, link).

    Calls sys.exit(2) on an illegal option and sys.exit(0) after
    -h/--help.  Sets the module-level flags _verbose and _debug.
    """
    global _verbose, _debug
    _verbose = 0
    # Bound up front so the flag exists even when -d is not given.
    _debug = 0
    inf = '/out'
    titlef = '/outTitle'
    inputdir = "."
    outf = 'final'

    def report(message):
        # Progress messages are shown in verbose mode only.
        if _verbose:
            print(message)

    try:
        opts, _ = getopt.getopt(argv, "hvi:o:d", ["help", "verbose", "input=", "output="])
    except getopt.GetoptError:
        sys.stderr.write("Illegal argument\n")
        usage()
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit(0)
        elif opt == '-d':
            _debug = 1
            #TODO: put debug condition
        elif opt in ("-v", "--verbose"):
            _verbose = 1
            #TODO: put verbose condition
        elif opt in ("-o", "--output"):
            #TODO: verify that arg is a usable file path
            outf = arg
        elif opt in ("-i", "--input"):
            #TODO: verify that arg is a directory path
            inputdir = arg

    report('Extracting links from "%s%s"...' % (inputdir, inf))
    llink = extract_links(inputdir + inf)
    report("Links extracted...")
    linksOcc = compute_occ(llink)
    report("Occurences computed...")

    # Read Title list
    report('Reading title from "%s%s"...' % (inputdir, titlef))
    lt = file_to_list(inputdir + titlef)
    report("Title list read...")

    # Links that are not in the title list
    ldiff = make_diff(lt, linksOcc.keys())
    report("Missing links found...")
    mlocc = retrieve_occ(ldiff, linksOcc)
    report("Occurences of missing links added...")

    #Sort the result
    lOccSorted = sorted(mlocc, reverse=True)
    format_and_write(lOccSorted, outf)
    report('Output written in "%s"' % outf)
if __name__ == '__main__':
    # Run as a script: hand over the arguments without the program name.
    cli_args = sys.argv[1:]
    main(cli_args)
#To estimate the running time, uncomment the lines below (remove the leading "##")
##import timeit
##t = timeit.Timer("main(sys.argv[1:])", "from __main__ import main")
##print t.repeat(3,5)