#!/usr/bin/perl # Copyright 2007 Gérald Sédrati-Dinet # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA use strict; use warnings; use Sleepycat::DbXml 'simple'; use XML::Twig; use LWP::UserAgent; use Getopt::Long; # Some files will be save under tree hierarchy based on executable location use FindBin qw($Bin); use lib "$Bin"; # Default values my $path2DbEnv = "$Bin/dbxml"; # Parse command line options Getopt::Long::Configure("bundling"); GetOptions( 'path2dbenv|p=s' => \$path2DbEnv, 'help|h' => sub { print STDERR <] where is the wiki name of the politician and can be candidates, mps, meps or ministers (for convenience the final "s" can be omitted, also ".dbxml" can be appended) Options can be: --path2dbenv, -p : path to directory of DB XML database default: $Bin/dbxml --help, -h: print this message USAGE exit 0; } ); my $wiki_name = shift or die "You should provide the wiki name of the politician, see $0 -h\n"; my $theContainer = shift or die "You should provide the name of the container, see $0 -h\n"; $theContainer =~ s/s?(?:\.dbxml)?$/s/; # Create a user agent object my $ua = LWP::UserAgent->new; $ua->agent("$0 (gibus perl script to fetch information for wiki.ffii.fr)"); my $headers = HTTP::Headers->new('Accept-Language' => 'fr'); eval { # Open a container in the db environment my $env = new DbEnv(0); $env->set_cachesize(0, 64 * 1024, 1); print "Opening $path2DbEnv\n"; $env->open($path2DbEnv, Db::DB_INIT_MPOOL|Db::DB_CREATE|Db::DB_INIT_LOCK|Db::DB_INIT_LOG); my $theMgr = new XmlManager($env); my $container = $theMgr->openContainer("$theContainer.dbxml"); my $context = $theMgr->createQueryContext(XmlQueryContext::LiveValues, XmlQueryContext::Eager); my $updateContext = $theMgr->createUpdateContext(); # Get the document my $theDocument = $container->getDocument($wiki_name); my $docString = $theDocument->getContent(); # Process this politician infos in XML format my $xml = XML::Twig->new(output_encoding => 'UTF-8',pretty_print => 'indented')->parse($docString); # Fetch wikipedia page for this politician my $first_name = $xml->root->first_child('infos')->first_child('name')->first_child_text('first'); my $last_name = $xml->root->first_child('infos')->first_child('name')->first_child_text('last'); my $wikipedia_url = "http://fr.wikipedia.org/wiki/${first_name}_${last_name}"; my $wikipedia_req = HTTP::Request->new(GET => $wikipedia_url, $headers); my $wikipedia_res = $ua->request($wikipedia_req); unless ($wikipedia_res->is_success) { die "Error fetching wikipedia page: $wikipedia_url: ", $wikipedia_res->status_line, "\n"; } my $wikipedia_html = $wikipedia_res->content; # Find websites on the wikipedia page my $found = 0; while ($wikipedia_html =~ /.*?(Site officiel|Blog|Site Web|Site de campagne) d[e']/g) { my ($web, $type) = (lc $1, lc $2); # Fix strange supplementary slash in some wikipedia links $web =~ s!http:///!http://!o; print "Found $type=<$web> for $first_name $last_name\n"; # Insert web sites found into XML my $parliament_url = $xml->root->first_child('contact')->first_child('web[@type]'); unless (grep {(my $a=$_->text)=~s!/(?:index\.\w+)?$!!;(my $b=$web)=~s!/(?:index\.\w+)?$!!;$a eq $b} $xml->root->first_child('contact')->children('web')) { $parliament_url->insert_new_elt('after','web'=>{type=>$type }, $web); $found ++; print "Added $type=<$web> for $first_name $last_name\n"; } } # Update document in database if ($found) { $docString = $xml->sprint; $theDocument->setContent( $docString ); $container->updateDocument($theDocument, $updateContext); $xml->purge; } exit 0; } ; if (my $e = catch std::exception) { warn "Exception:\n"; warn $e->what() . "\n"; exit( -1 ); } elsif ($@) { warn "Query failed\n"; warn $@; exit( -1 ); }