#!/usr/bin/perl # Copyright 2007 Gérald Sédrati-Dinet # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA use strict; use warnings; use LWP::UserAgent; use XML::Twig; use Unicode::String qw(utf8 latin1); use File::Basename; use Getopt::Long; # Some files will be save under tree hierarchy based on executable location use FindBin qw($Bin); use lib "$Bin"; # Default values my $output_dir = "$Bin/xml/mps"; my $debug = 0; my $this_mp; # Parse command line options Getopt::Long::Configure("bundling"); GetOptions( 'mp|m=s' => \$this_mp, 'output|o=s' => \$output_dir, 'debug|d' => \$debug, 'help|h' => sub { print STDERR <] [-o : wiki name of the MP to processed default: unset => all MPs are processed --output, -o : output directory of xml pages default: $Bin/xml/mps --debug, -d debug default: no debug --help, -h: print this message USAGE exit 0; } ); my %groups = ( 'Union pour un Mouvement Populaire' => 'UMP', 'Socialiste' => 'PS', 'Union pour la Démocratie Française' => 'UDF', 'Député-e-s Communistes et Républicains' => 'PCF', 'Députés n\'appartenant à aucun groupe' => '?', ); my %no_groups = ( 'Gérard Charasse' => 'PRG', 'Émile Zuccarelli' => 'PRG', 'Noël Mamère' => 'Verts', 'Martine Billard' => 'Verts', 'Yves Cochet' => 'Verts', 'Édouard Leveau' => 'CNIP (divers-droite)', 'Véronique Besse' => 'MPF', 'Joël Sarlot' => 'MPF', 'Alfred Marie-Jeanne' => 'MIM (divers-gauche)', 'Huguette Bello' => 'PCR', 'Pierre-Christophe Baguet' => 'ex UDF', 'Philippe Edmond-Mariette' => 'BPM (divers-gauche)', ); # Create a user agent object my $ua = LWP::UserAgent->new; $ua->agent("$0 (gibus perl script to fetch information for wiki.ffii.fr)"); my $headers = HTTP::Headers->new('Accept-Language' => 'fr'); # Fetch list of departments my $departments_url = 'http://www.assemblee-nationale.fr/12/qui/circonscriptions/'; # Create a request my $departments_req = HTTP::Request->new(GET => $departments_url, $headers); # Pass request to the user agent and get a response back my $departments_res = $ua->request($departments_req); # Check the outcome of the response unless ($departments_res->is_success) { die "Error fetching list of departments: $departments_url: ", $departments_res->status_line, "\n"; } # Parse each department foreach my $department_line (split /\n/, $departments_res->content) { $department_line = latin1($department_line)->utf8; # Dirty hack to fix formating for Guadeloupe $department_line = ' GUADELOUPE 971
' if $department_line eq ' GUADELOUPE 971
'; next unless $department_line =~ /([^<]+)<\/font><\/b> ([^<]+)<\/a>/o; my ($department_name, $department_url, $department_number) = ($1, $2, $3); my $constituencies_html; my @constituencies; if ($department_url =~ /^\d\d[ab\d]\.asp$/) { # Fetch list of constituencies $department_url = "http://www.assemblee-nationale.fr/12/qui/circonscriptions/$department_url"; my $department_req = HTTP::Request->new(GET => $department_url, $headers); my $department_res = $ua->request($department_req); unless ($department_res->is_success) { die "Error fetching list of constituencies: $department_url: ", $department_res->status_line, "\n"; } $constituencies_html = latin1($department_res->content)->utf8; @constituencies = ($constituencies_html =~ /^c\d+ = "([^"]+)"/gom); } # Hack for DOM/TOM with one contituency by department else { $constituencies_html = " "; push @constituencies, $department_name; } $constituencies_html =~ s/\015\012?/\n/go; # Parse each constituency foreach my $constituency_line (split /\n/, $constituencies_html) { next unless $constituency_line =~ /^\s+new(GET => $mp_url, $headers); my $mp_res = $ua->request($mp_req); unless ($mp_res->is_success) { die "Error fetching deputy: $mp_url: ", $mp_res->status_line, "\n"; } my $mp_html = latin1($mp_res->content)->utf8; # Skip vacant posts next if $mp_html =~ /Si(?:è|è)ge vacant/o; # Extract infos # Name (my $mp_name) = ($mp_html =~ /

([^>]+)<\/h1>/o); my $mp_wiki_name = wikify($mp_name); next if (defined $this_mp and $mp_wiki_name ne $this_mp); (my $mp_first_name = $mp_name) =~ s/ ([-'\wÀÂÄÉÈÊËÍÎÏÓÔÖÙÛÜÇàâäéèêëíîïóôöùûüç]+)$//o; my $mp_last_name = $1; # Hack for compound name with single quote (eg. Louis Giscard d'Estaing) if ($mp_last_name =~ /'/o and $mp_first_name =~ /^([-'\wÀÂÄÉÈÊËÍÎÏÓÔÖÙÛÜÇàâäéèêëíîïóôöùûüç]+) ([-'\wÀÂÄÉÈÊËÍÎÏÓÔÖÙÛÜÇàâäéèêëíîïóôöùûüç]+)$/o) { $mp_first_name = $1; $mp_last_name = "$2 $mp_last_name"; } # Hack for compound name with particule (eg. Jean de Gaulle) if ($mp_first_name =~ /^([-'\wÀÂÄÉÈÊËÍÎÏÓÔÖÙÛÜÇàâäéèêëíîïóôöùûüç]+) (de)$/o) { $mp_first_name = $1; $mp_last_name = "$2 $mp_last_name"; } # Gender (my $mp_gender) = ($mp_html =~ /

Informations générales<\/h1>