#!/usr/bin/perl # Copyright 2007 Gérald Sédrati-Dinet # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA use strict; use warnings; use Sleepycat::DbXml 'simple'; use XML::Twig; use LWP::UserAgent; use Getopt::Long; # Some files will be save under tree hierarchy based on executable location use FindBin qw($Bin); use lib "$Bin"; # Default values my $path2DbEnv = "$Bin/dbxml"; # Parse command line options Getopt::Long::Configure("bundling"); GetOptions( 'path2dbenv|p=s' => \$path2DbEnv, 'help|h' => sub { print STDERR <] where can be candidates, mps, meps or ministers (for convenience the final "s" can be omitted, also ".dbxml" can be appended) Options can be: --path2dbenv, -p : path to directory of DB XML database default: $Bin/dbxml --help, -h: print this message USAGE exit 0; } ); my $theContainer = shift or die "You should provide the name of the container, see $0 -h\n"; $theContainer =~ s/s?(?:\.dbxml)?$/s/; # Create a user agent object my $ua = LWP::UserAgent->new; $ua->agent("$0 (gibus perl script to fetch information for wiki.ffii.fr)"); my $headers = HTTP::Headers->new('Accept-Language' => 'fr'); eval { # Open a container in the db environment my $env = new DbEnv(0); $env->set_cachesize(0, 64 * 1024, 1); print "Opening $path2DbEnv\n"; $env->open($path2DbEnv, Db::DB_INIT_MPOOL|Db::DB_CREATE|Db::DB_INIT_LOCK|Db::DB_INIT_LOG); my $theMgr = new XmlManager($env); my $container = $theMgr->openContainer("$theContainer.dbxml"); my $context = $theMgr->createQueryContext(XmlQueryContext::LiveValues, XmlQueryContext::Eager); # Get all documents my $query = "collection('".$container->getName()."')/politician"; my $results = $theMgr->query($query, $context ); print "Found ".$results->size()." documents matching the expression '$query'\n\n"; my $updateContext = $theMgr->createUpdateContext(); my $theDocument = $theMgr->createDocument(); while( $results->next($theDocument) ) { # Process this politician infos in XML format my $docString = $theDocument->getContent(); my $xml = XML::Twig->new(output_encoding => 'UTF-8',pretty_print => 'indented')->parse($docString); # Fetch wikipedia page for this politician my $first_name = $xml->root->first_child('infos')->first_child('name')->first_child_text('first'); my $last_name = $xml->root->first_child('infos')->first_child('name')->first_child_text('last'); my $wikipedia_url = "http://fr.wikipedia.org/wiki/${first_name}_${last_name}"; my $wikipedia_req = HTTP::Request->new(GET => $wikipedia_url, $headers); my $wikipedia_res = $ua->request($wikipedia_req); unless ($wikipedia_res->is_success) { die "Error fetching wikipedia page: $wikipedia_url: ", $wikipedia_res->status_line, "\n"; } my $wikipedia_html = $wikipedia_res->content; # Find websites on the wikipedia page my $found = 0; while ($wikipedia_html =~ /.*?(Site officiel|Blog|Site Web|Site de campagne) d[e']/g) { my ($web, $type) = (lc $1, lc $2); # Fix strange supplementary slash in some wikipedia links $web =~ s!http:///!http://!o; print "Found $type=<$web> for $first_name $last_name\n"; # Insert web sites found into XML my $parliament_url = $xml->root->first_child('contact')->first_child('web[@type]'); unless (grep {(my $a=$_->text)=~s!/(?:index\.\w+)?$!!;(my $b=$web)=~s!/(?:index\.\w+)?$!!;$a eq $b} $xml->root->first_child('contact')->children('web')) { $parliament_url->insert_new_elt('after','web'=>{type=>$type }, $web); $found ++; print "Added $type=<$web> for $first_name $last_name\n"; } } # Update document in database if ($found) { $docString = $xml->sprint; $theDocument->setContent( $docString ); $container->updateDocument($theDocument, $updateContext); $xml->purge; } } exit 0; } ; if (my $e = catch std::exception) { warn "Exception:\n"; warn $e->what() . "\n"; exit( -1 ); } elsif ($@) { warn "Query failed\n"; warn $@; exit( -1 ); }