#!/usr/bin/perl # Copyright 2007 Gérald Sédrati-Dinet # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA use strict; use warnings; use LWP::UserAgent; use XML::Twig; use Unicode::String qw(utf8 latin1); use File::Basename; use locale; use POSIX 'locale_h'; use Getopt::Long; # Some files will be save under tree hierarchy based on executable location use FindBin qw($Bin); use lib "$Bin"; # Default values my $output_dir = "$Bin/xml/meps"; my $debug = 0; my $this_mep; # Parse command line options Getopt::Long::Configure("bundling"); GetOptions( 'mep|m=s' => \$this_mep, 'output|o=s' => \$output_dir, 'debug|d' => \$debug, 'help|h' => sub { print STDERR <] [-o : wiki name of the MEP to processed default: unset => all MEPs are processed --output, -o : output directory of xml pages default: $Bin/xml/meps --debug, -d debug default: no debug --help, -h: print this message USAGE exit 0; } ); my %groups = ( 'Groupe du Parti populaire européen (Démocrates-chrétiens) et des Démocrates européens' => 'PPE/DE', 'Groupe socialiste au Parlement européen' => 'PSE', 'Groupe Alliance des démocrates et des libéraux pour l\'Europe' => 'ALDE', 'Groupe Union pour l\'Europe des Nations' => 'UEN', 'Groupe des Verts/Alliance libre européenne' => 'Verts/ALE', 'Groupe confédéral de la Gauche unitaire européenne/Gauche verte nordique' => 'GUE/NGL', 'Groupe Indépendance/Démocratie' => 'IND/DEM', 'Groupe Identité, Tradition, Souveraineté' => 'ITS', 'Non-inscrits' => 'NI', ); # Create a user agent object my $ua = LWP::UserAgent->new; $ua->agent("$0 (gibus perl script to fetch information for wiki.ffii.fr)"); my $headers = HTTP::Headers->new('Accept-Language' => 'fr'); # Fetch list of regions my $regions_url = 'http://www.europarl.europa.eu/members/public/geoSearch/zoneList.do?country=FR&language=FR'; # Create a request my $regions_req = HTTP::Request->new(GET => $regions_url, $headers); # Pass request to the user agent and get a response back warn "Fetching $regions_url...\n"; my $regions_res = $ua->request($regions_req); # Check the outcome of the response unless ($regions_res->is_success) { die "Error fetching list of regions: $regions_url: ", $regions_res->status_line, "\n"; } warn "OK.\n"; # Parse each region foreach my $region_line (split /\n/, $regions_res->content) { next unless $region_line =~ /^\s+France : \2<\/a>$/o; my ($region_name_url, $region_name) = ($1, $2); # Fetch list of MEPs my $region_url = "http://www.europarl.europa.eu/members/public/geoSearch/search.do?country=FR&zone=$region_name_url&langage=FR"; my $region_req = HTTP::Request->new(GET => $region_url, $headers); warn "Fetching $region_url...\n"; my $region_res = $ua->request($region_req); unless ($region_res->is_success) { die "Error fetching list of MEPs: $region_url: ", $region_res->status_line, "\n"; } warn "OK.\n"; # Parse each MEP foreach my $mep_line (split /\n/, $region_res->content) { next unless $mep_line =~ /^\s+\2<\/a>$/o; my $mep_id = $1; # Fetch MEP my $mep_url = "http://www.europarl.europa.eu/members/expert/alphaOrder/view.do?language=FR&id=$mep_id"; warn "Fetching $mep_url...\n"; my $mep_req = HTTP::Request->new(GET => $mep_url, $headers); my $mep_res = $ua->request($mep_req); unless ($mep_res->is_success) { die "Error fetching deputy: $mep_url: ", $mep_res->status_line, "\n"; } my $mep_html = $mep_res->content; # Extract infos # Name my ($mep_first_name, $mep_last_name) = ($mep_html =~ /\n\s+?\n\s+(.+?)\n\s+(.+?)\n\s+<\/td>/os); $mep_last_name = capitalize($mep_last_name); my $mep_name = "$mep_first_name $mep_last_name"; my $mep_wiki_name = wikify($mep_name); next if (defined $this_mep and $mep_wiki_name ne $this_mep); # Picture my $mep_pic_url = "http://www.europarl.europa.eu/mepphoto/$mep_id.jpg"; my $mep_pic_location = "$Bin/IMG/eurodeputes/$mep_wiki_name.jpg"; $ua->mirror($mep_pic_url, $mep_pic_location) unless $debug; # Birth my ($mep_birth_gender, $mep_birth_date, $mep_birth_city) = ($mep_html =~ /\s+(Né(?:e?)) le (.+?),([^>]+)<\/td>/o); my ($mep_birth_day, $mep_birth_month, $mep_birth_year) = ($mep_birth_date =~ /^(\d+) (\S+) (\d+)$/o); $mep_birth_city =~ s/^\s+//o; $mep_birth_city =~ s/\s+$//o; # Political group my ($mep_grp, $mep_grp_role) = ($mep_html =~ /([^<]+)<\/span>
\s+\s+(\S+)/o); # Temporary hack for new fascist group $mep_grp =~ s/^body\.fullName\.2345_FR$/Groupe Identité, Tradition, Souveraineté/o; die "Unknown group <$mep_grp> for <$mep_url>\n" unless exists $groups{$mep_grp}; my $mep_grp_abbrev = $groups{$mep_grp}; (my $mep_party) = ($mep_html =~ /\s+([^<]+)
/o); $mep_party =~ s/\s+$//o; # Web (my $mep_web) = ($mep_html =~ /\s+
\s+Parlement européen<\/strong>
\s+Bât\. ([^<]+)
\s+([^<]+)
\s+60, rue Wiertz \/ Wiertzstraat 60
\s+B-1047 Bruxelles\/Brussel<\/strong>
\s+Tél<\/strong> : \+32 \(0\)2 28 45(\d\d\d)
\s+Fax<\/strong> : \+32 \(0\)2 28 49\3
/o); # Strasbourg Parliament my ($stb_building, $stb_office) = ($mep_html =~ /Parlement européen<\/strong>
\s+Bât\. ([^<]+)
\s+([^<]+)
\s+Allée du Printemps
\s+BP 10024\/F
\s+F-67070 Strasbourg Cedex<\/strong>
\s+Tél<\/strong> : \+33 \(0\)3 88 1 75$bxl_phone_ext
\s+Fax<\/strong> : \+33 \(0\)3 88 1 79$bxl_phone_ext
/); my @mep_address = ( { LABEL=>'Parlement européen', STREET=>'60, rue Wiertz', BUILDING=>$bxl_building, OFFICE=>$bxl_office, POSTCODE=>'1047', CITY=>'Bruxelles', TEL=>["+32 2 28 45 $bxl_phone_ext", "+32 2 28 47 $bxl_phone_ext"], FAX=>["+32 2 28 49 $bxl_phone_ext"], }, { LABEL=>'Parlement européen', STREET=>'Allée du Printemps BP 10024', BUILDING=>$stb_building, OFFICE=>$stb_office, POSTCODE=>'67070', CITY=>'Strasbourg Cedex', TEL=>["+33 3 88 175 $bxl_phone_ext", "+33 3 88 177 $bxl_phone_ext"], FAX=>["+33 3 88 179 $bxl_phone_ext"], } ); # Other addreess my ($other_address) = ($mep_html =~ /(.+?)<\/span>/os); $other_address =~ s/\s\s+//go; $other_address =~ s/^Adresses? postales? (?:: )?<\/strong>(?:
)?//o; foreach my $addr (split /
/, $other_address) { my @parts = reverse(split / /, $addr); my %addr; $addr{CITY} = capitalize(shift @parts); $addr{CITY} =~ s/(.)'/\l$1'/go; ($addr{POSTCODE} = shift @parts) =~ s/^FR-//o; $addr{STREET} = shift @parts; $addr{STREET} =~ s/'/'/go; $addr{STREET} =~ s/"//go; $addr{LABEL} = shift @parts if scalar @parts; # No need to repeat Parliament address (+ fix typos !!!) next if scalar @parts and $parts[$#parts] =~ /^Parl[ea]ment europ.+en$/; push @mep_address, \%addr; } # CV my ($mep_cv_txt) = ($mep_html =~ /]+>\s*\s*]+>Curriculum Vitae<\/td>\s*<\/tr>\s*(.+?)<\/td><\/tr>\s*<\/table>/os); $mep_cv_txt = "" unless $mep_cv_txt; $mep_cv_txt =~ s/\s\s+//go; $mep_cv_txt =~ s/]+">]+><\/td>/ /go; $mep_cv_txt =~ s/<\/?t[rd][^>]*>//go; $mep_cv_txt =~ s/'/'/go; $mep_cv_txt =~ s/"/'/go; $mep_cv_txt =~ s/^ //go; $mep_cv_txt =~ s/\.$//go; my @mep_cv_lines = map {ucfirst $_} split /\. (?!\d\d\d\d)| ?; /o, $mep_cv_txt; # Functions my ($mep_functions_txt) = ($mep_html =~ /\s*\s*]+>\s*(?:\s*<\/td><\/tr>\s*)*\s*]+>\s*\s*(.+?)\s*<\/td><\/tr>\s*<\/table>\s*(?:<\/td><\/tr>\s*\s*)*(?:|]+>\s*\s*]+>Curriculum Vitae<\/td>)/os); $mep_functions_txt = "" unless $mep_functions_txt; $mep_functions_txt =~ s/\s\s+//go; $mep_functions_txt =~ s/\n//go; $mep_functions_txt =~ s/<\/?tr[^>]*>//go; $mep_functions_txt =~ s/<\/?table[^>]*>//go; $mep_functions_txt =~ s/]+>[^<]+<\/a>//go; $mep_functions_txt =~ s/<\/td>//go; $mep_functions_txt =~ s/]+">]+><\/td>/ /go; $mep_functions_txt =~ s/<\/?td>//go; $mep_functions_txt =~ s/ /o, $mep_functions_txt; shift @mep_functions_by_role; my @mep_functions; foreach my $func (@mep_functions_by_role) { $func =~ s/^(.+?)(?=/o, $func; shift @func; map {push @mep_functions, {ROLE=>$role, VALUE=>$_}} @func; } # Build XML tree my $xml = XML::Twig->new(pretty_print => 'indented'); $xml->set_encoding('UTF-8'); my $root = XML::Twig::Elt->new('politician'); $xml->set_root($root); $root->insert_new_elt('first_child', 'container', 'meps.dbxml'); my $info = $root->insert_new_elt('last_child', 'infos'); my $name = $info->insert_new_elt('last_child', 'name'); $name->insert_new_elt('last_child', 'first', $mep_first_name); $name->insert_new_elt('last_child', 'last', $mep_last_name); $name->insert_new_elt('last_child', 'wiki', $mep_wiki_name); $info->insert_new_elt('last_child', 'picture', basename($mep_pic_location)); my $birth = $info->insert_new_elt('last_child', 'birth'); $birth->insert_new_elt('last_child', 'gender', $mep_birth_gender); my $birth_date = $birth->insert_new_elt('last_child', 'date'); $birth_date->insert_new_elt('last_child', 'day', $mep_birth_day); $birth_date->insert_new_elt('last_child', 'month', $mep_birth_month); $birth_date->insert_new_elt('last_child', 'year', $mep_birth_year); my $birth_place = $birth->insert_new_elt('last_child', 'place'); $birth_place->insert_new_elt('last_child', 'city', $mep_birth_city); my $constituency = $info->insert_new_elt('last_child', 'constituency'); my $region = $constituency->insert_new_elt('last_child', 'region'); $region->insert_new_elt('last_child', 'name', $region_name); my $group = $info->insert_new_elt('last_child', 'group'); $group->set_att('role', $mep_grp_role) if $mep_grp_role; $group->insert_new_elt('last_child', 'name', $mep_grp); $group->insert_new_elt('last_child', 'abbreviation', $mep_grp_abbrev); $group->insert_new_elt('last_child', 'party', $mep_party); my $contact = $root->insert_new_elt('last_child', 'contact'); $contact->insert_new_elt('last_child', web => { type => "Page sur le site du Parlement européen" }, $mep_url); $contact->insert_new_elt('last_child', 'web', $mep_web) if $mep_web; $contact->insert_new_elt('last_child', 'email', $mep_email) if $mep_email; if ($mep_guessed_email) { my $guessed_email = $contact->insert_new_elt('last_child', 'email', $mep_guessed_email); $guessed_email->set_att('guessed', 'true'); } foreach my $addr (@mep_address) { my $address = $contact->insert_new_elt('last_child', 'address'); $address->insert_new_elt('last_child', 'label', $addr->{LABEL}) if $addr->{LABEL}; $address->insert_new_elt('last_child', 'building', $addr->{BUILDING}) if $addr->{BUILDING}; $address->insert_new_elt('last_child', 'office', $addr->{OFFICE}) if $addr->{OFFICE}; $address->insert_new_elt('last_child', 'street', $addr->{STREET}) if $addr->{STREET}; $address->insert_new_elt('last_child', 'postcode', $addr->{POSTCODE}) if $addr->{POSTCODE}; $address->insert_new_elt('last_child', 'city', $addr->{CITY}) if $addr->{CITY}; if ($addr->{TEL}) { map {$address->insert_new_elt('last_child', 'phone', $_)} @{$addr->{TEL}}; } if ($addr->{FAX}) { map {$address->insert_new_elt('last_child', 'fax', $_)} @{$addr->{FAX}}; } } my $functions = $root->insert_new_elt('last_child', 'functions'); foreach my $func (@mep_functions) { my $function = $functions->insert_new_elt('last_child', 'function'); $function->insert_new_elt('last_child', 'label', $func->{VALUE}); $function->insert_new_elt('last_child', 'role', $func->{ROLE}); } my $cv = $root->insert_new_elt('last_child', 'cv'); foreach my $pos (@mep_cv_lines) { $cv->insert_new_elt('last_child', 'position', $pos); } my $activities = $root->insert_new_elt('last_child', 'activities'); $activities->insert_new_elt('last_child', 'questions', "http://www.europarl.europa.eu/omk/sipade3?PROG=QP&L=FR&SORT_ORDER=D&S_REF_QP=%&LEG_ID=6&AUTHOR_ID=$mep_id&MI_TEXT=brevet%25&F_MI_TEXT=brevet*"); $activities->insert_new_elt('last_child', 'motions', "http://www.europarl.europa.eu/sidesSearch/sipadeMapUrl.do?PROG=MOTION&L=FR&SORT_ORDER=D&S_REF_QP=%&LEG_ID=6&AUTHOR_ID=$mep_id&MI_TEXT=brevet%25&F_MI_TEXT=brevet*"); $activities->insert_new_elt('last_child', 'reports', "http://www.europarl.europa.eu/sidesSearch/sipadeMapUrl.do?PROG=REPORT&L=FR&SORT_ORDER=D&S_REF_QP=%&LEG_ID=6&AUTHOR_ID=$mep_id&MI_TEXT=brevet%25&F_MI_TEXT=brevet*"); $activities->insert_new_elt('last_child', 'declarations', "http://www.europarl.europa.eu/sidesSearch/search.do?type=WDECL&language=FR&term=6&author=$mep_id"); my $xml_location = "$output_dir/$mep_wiki_name.xml"; open XML, ">$xml_location" or die "Cannot write in file $xml_location: $!\n"; $xml->flush(\*XML); close XML or die "Error when closing $xml_location: $!\n"; warn "first=!$mep_first_name! last=!$mep_last_name! wiki=!$mep_wiki_name! day=!$mep_birth_day! month=!$mep_birth_month! year=!$mep_birth_year! city=!$mep_birth_city! role=!$mep_grp_role! grp=!$mep_grp! abbrev=!$mep_grp_abbrev! party=!$mep_party!".($mep_web?" web=!$mep_web!":'').($mep_email?" email=!$mep_email!":" ?$mep_guessed_email?")."\naddr=[ ".join(" ]\naddr=[ ", map({($_->{LABEL}?"label=!$_->{LABEL}! ":'')."street=!$_->{STREET}! postcode=!$_->{POSTCODE}! city=!$_->{CITY}! ".($_->{TEL}?"tel=!".join("/",@{$_->{TEL}})."!":'').($_->{FAX}?"fax=!".join("/",@{$_->{FAX}})."!":'')} @mep_address))."]\ncv=[ ".join(" ]\ncv=[ ", map({$_} @mep_cv_lines))." ]\nfunction=[ ".join(" ]\nfunction=[ ", map({$_->{VALUE}." (".$_->{ROLE}.")"} @mep_functions))." ]\n\n" if $debug; } } sub capitalize { my $str = shift; # Hack since accented letters cannot be changed in lower/upper case in utf8 setlocale(LC_CTYPE, "fr_FR"); $str = utf8($str)->latin1; $str =~ s/(\w+)/\u\L$1/go; $str = latin1($str)->utf8; setlocale(LC_CTYPE, "fr_FR.UTF-8"); # Particule should be in lower case $str =~ s/\bD([e'])\b/d$1/go; return $str; } sub wikify { my $str = shift; $str = desaccent($str); $str =~ s/-//go; return $str; } sub desaccent { my $str = shift; $str =~ s/À/A/go; $str =~ s/Â/A/go; $str =~ s/Ä/A/go; $str =~ s/É/E/go; $str =~ s/È/E/go; $str =~ s/Ê/E/go; $str =~ s/Ë/E/go; $str =~ s/Í/I/go; $str =~ s/Î/I/go; $str =~ s/Ï/I/go; $str =~ s/Ó/O/go; $str =~ s/Ô/O/go; $str =~ s/Ö/O/go; $str =~ s/Ù/U/go; $str =~ s/Û/U/go; $str =~ s/Ü/U/go; $str =~ s/Ç/C/go; $str =~ s/à/a/go; $str =~ s/â/a/go; $str =~ s/ä/a/go; $str =~ s/é/e/go; $str =~ s/è/e/go; $str =~ s/ê/e/go; $str =~ s/ë/e/go; $str =~ s/í/i/go; $str =~ s/î/i/go; $str =~ s/ï/i/go; $str =~ s/ó/o/go; $str =~ s/ô/o/go; $str =~ s/ö/o/go; $str =~ s/ù/u/go; $str =~ s/û/u/go; $str =~ s/ü/u/go; $str =~ s/ç/c/go; $str =~ s/[ ']//go; return $str; }