#!/usr/bin/perl
use strict ;
use utf8 ;
use open ':utf8';
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");
# Pause, in seconds, observed after each processed character.
# The script used to declare $delay twice (1, then 30); only the second
# declaration had any effect, so a single declaration is kept.
#my $delay = 1 ;
my $delay = 30 ;
# Directory the articles are read from (one file per character).
my $srcdir = "/home/jean/wfs/wiktionary-fr";
# Directory the rewritten articles are written to. It is currently the same
# directory as $srcdir, so articles are overwritten in place.
#my $dstdir = "/home/jean/new";
my $dstdir = "/home/jean/wfs/wiktionary-fr";
# Read the list of characters to process, one entry per line, into
# @caracteres. The script stops if the list cannot be opened.
my @caracteres = () ;
my $listfile = "/home/jean/db/jeudessai";
open(my $lst, '<:encoding(UTF-8)', $listfile)
    or die "cannot open hanja list $listfile: $!\n";
while (my $ligne = <$lst>) {
    # chomp only strips a trailing newline; chop would also eat the last
    # real character of a final line that has no newline.
    chomp $ligne;
    # An empty line is not a character to process.
    next if $ligne eq '';
    push @caracteres, $ligne;
}
close($lst);
# Read the eumhun dictionary into %hanja.
# Each useful line has the form "key=value"; lines starting with "[" are
# skipped. Korean annotations found in the value are replaced by French
# wiki text, and several values for the same key are joined with ", ".
# The script stops if the dictionary cannot be opened.
my %hanja = () ;
my $nabifile = "/home/jean/db/nabi-hanja.txt";
open(my $nabi, '<:encoding(UTF-8)', $nabifile)
    or die "cannot open eumhun list $nabifile: $!\n";
while (my $ligne = <$nabi>) {
    next if $ligne =~ /^\[/;
    # chomp (not chop) so a final line without newline keeps its last
    # character.
    chomp $ligne;
    # Limit of 2: everything after the first "=" belongs to the value.
    my ($k, $v) = split(/=/, $ligne, 2);
    # Ignore lines without a (non-empty) value.
    next unless defined $v && $v =~ /^./;

    # The two-character form "X·Y과 同字" must be handled before the
    # one-character form, otherwise the latter consumes "Y과 同字" and the
    # former can never match.
    $v =~ s/(.)·(.)(과|와) 同字/(forme alternative de [[$1]] et [[$2]])/;
    $v =~ s/(.)(과|와) 同字/(forme alternative de [[$1]])/;
    $v =~ s/(.)의 譌字/(forme incorrecte de [[$1]])/;
    # NOTE(review): same French text as for 譌字 above - confirm this is
    # the intended wording for 簡體字.
    $v =~ s/(.)의 簡體字/(forme incorrecte de [[$1]])/;
    $v =~ s/(.)의 古字/(forme archaïque de [[$1]])/;
    $v =~ s/(.)의 本字/(forme originale de [[$1]])/;
    $v =~ s/(.)의 俗字/(forme familière de [[$1]])/;
    $v =~ s/(.)의 訛字/(forme incorrecte pour [[$1]])/;
    $v =~ s/(.)의 正字/(forme correcte de [[$1]])/;
    $v =~ s/(.)의 갖은자/(variante sémantique de [[$1]])/;
    # NOTE(review): "abregée" looks like a typo for "abrégée"; left as is
    # because it is part of the generated article text.
    $v =~ s/(.)의 略字/(forme abregée de [[$1]])/;

    # Accumulate every value found for the same key.
    $hanja{$k} = exists $hanja{$k} ? $hanja{$k} . ", " . $v : $v;
}
close($nabi);
# Each article is read and split into the following parts:
# - the text before the hanja block
# - the hanja block itself (only the values of its items are kept)
# - the {{=ja=}} section and the {{-voir-}} section, when present
# - the text after the hanja block
#
# Names of the items whose value is extracted from the hanja block.
my @listitem = qw(Hangul roman-ko McCune eumhun Yale-ko) ;
# Alternation of the patterns matching the hanja-block lines that are
# skipped without any message.
my $ignore = join('|',
    '(^$|-pron|ébauche|\{\{API\}\}|\{\{SAMPA\}\})',
    'Romanisation :',
    '\{\{-réf-\}\}',
    '\{\{R:libhangul\}\}',
) ;
# Main loop: for every character, read its article from $srcdir, split it
# into parts, refresh the values of the hanja block from %hanja and write
# the reordered article to $dstdir. The "continue" block pauses $delay
# seconds after every character, including the ones that are skipped.
foreach my $car (@caracteres) {
    print STDERR "DEBUG : traitement de $car...\n" ;
    # Parser state: 0 = before the hanja block, 1 = inside the hanja block,
    # 2 = after it, 3 = {{=ja=}} section, 4 = {{-voir-}} section.
    my $etat = 0 ;
    my $avant = "";
    my $apres = "";
    my $ja = "";
    my $voir = "";
    my %val = () ;   # item name => value read from the hanja block

    # Without the original text the article would be overwritten with the
    # regenerated block only, so a character that cannot be read is skipped.
    my $article;
    if (!open($article, '<:encoding(UTF-8)', "$srcdir/$car")) {
        print STDERR "cannot read $srcdir/$car: $!\n";
        next;
    }
    while (my $ligne = <$article>) {
        if ($ligne =~ /\[\[Catégorie/ || $ligne =~ /\[\[..:\Q$car\E/) {
            # Category or "[[xx:<character>" link: start of the trailing part.
            print STDERR "etat=apres\n";
            $apres .= $ligne ;
            $etat = 2;
        } elsif ($ligne =~ /\{\{-voir-\}\}/) {
            print STDERR "etat=voir\n";
            $etat = 4;
            $voir .= $ligne;
        } elsif ($ligne =~ /\{\{=ja=\}\}/) {
            print STDERR "etat=ja\n";
            $ja .= $ligne ;
            $etat = 3;
        } elsif ($ligne =~ /\{\{=ko-hanja=\}\}/) {
            # Start of the hanja block; the header line itself is dropped
            # because the block is regenerated below.
            print STDERR "etat=ko\n";
            $etat = 1;
        } elsif ($etat == 1) {
            if ($ligne =~ /\{\{=/ || $ligne =~ /\[\[en/ || $ligne =~ /\{\{-voir/) {
                print STDERR "etat=apres (ko)\n";
                $apres .= $ligne ;
                $etat = 2;
            } else {
                # chomp (not chop) so a last line without newline keeps its
                # final character.
                chomp $ligne;
                next if $ligne =~ /$ignore/ ;
                my $found = 0 ;
                foreach my $item (@listitem) {
                    if ($ligne =~ /\{\{\Q$item\E\}\}\s*:\s*(\S*.*)/) {
                        $val{$item} = $1 ;
                        $found = 1;
                    }
                }
                if ($found == 0) {
                    print STDERR "DEBUG : $ligne : no item found\n" ;
                }
            }
        } elsif ($etat == 4) {
            $voir .= $ligne;
        } elsif ($etat == 3) {
            $ja .= $ligne ;
        } elsif ($etat == 2) {
            $apres .= $ligne ;
        } elsif ($etat == 0) {
            $avant .= $ligne ;
        }
    }
    close($article);

    # The eumhun comes from the dictionary when it knows the character.
    if ($hanja{$car}) {
        $val{'eumhun'} = $hanja{$car} ;
        # Derive the hangul list from the eumhun: the last non-blank
        # character of each comma-separated entry, when it lies in the
        # range U+AC00..U+D7A3. Duplicates are dropped and the order of
        # first appearance is kept so that the output is reproducible.
        my @hanguls = () ;
        my %vu = () ;
        foreach my $eumhun (split(/,/, $val{'eumhun'})) {
            next unless $eumhun =~ /\s*(\S)\s*$/;
            my $syllabe = $1 ;
            my $n = ord($syllabe) ;
            next unless $n >= 0xac00 && $n <= 0xd7a3;
            push @hanguls, $syllabe unless $vu{$syllabe}++;
        }
        # The hangul of the article is always replaced by the derived one.
        $val{'Hangul'} = join(", ", @hanguls) ;
    } else {
        print STDERR "pas de eumhun pour $car\n";
    }

    # The article is rewritten only when a hangul value is available.
    if ($val{'Hangul'}) {
        my $cible = "$dstdir/$car";
        my $new;
        if (!open($new, '>:encoding(UTF-8)', $cible)) {
            print STDERR "cannot write $cible: $!\n";
            next;
        }
        # Parts are written in a fixed order, with the hanja block
        # regenerated from the collected values.
        print {$new}
            $avant, $ja, $voir,
            "\n{{=ko-hanja=}}\n",
            "{{ébauche|ko}}\n",
            "* {{Hangul}} : ", $val{'Hangul'}, "\n",
            "* {{eumhun}} : ", $val{'eumhun'}, "\n",
            "* Romanisation :\n",
            "** {{roman-ko}} : ", $val{'roman-ko'}, "\n",
            "** {{McCune}} : ", $val{'McCune'}, "\n",
            "** {{Yale-ko}} : ", $val{'Yale-ko'}, "\n",
            "{{-réf-}}\n",
            "{{R:libhangul}}\n",
            $apres,
            "\n[[Summary: reformatage section hanja, ajout eumhun, replacement -voir-]]\n";
        # Buffered write errors only show up when the handle is closed.
        close($new) or print STDERR "error while writing $cible: $!\n";
    } else {
        print STDERR "Pas de modification pour $car\n";
    }
} continue {
    print STDERR "attente de $delay secondes...\n";
    sleep($delay);
}