head 1.7; access; symbols; locks; strict; comment @# @; 1.7 date 2004.11.17.08.53.21; author rse; state Exp; branches; next 1.6; 1.6 date 2004.11.16.16.23.28; author rse; state Exp; branches; next 1.5; 1.5 date 2004.11.16.16.20.27; author rse; state Exp; branches; next 1.4; 1.4 date 2004.11.16.14.05.32; author hms; state Exp; branches; next 1.3; 1.3 date 2004.11.08.15.32.59; author rse; state Exp; branches; next 1.2; 1.2 date 2004.11.08.15.26.01; author rse; state Exp; branches; next 1.1; 1.1 date 2004.10.25.14.35.11; author rse; state Exp; branches; next ; desc @@ 1.7 log @fix logic for SRC/00INDEX.rdf.bz2 files where both rdf:Description and Repository containers are inside a top-level Repository container @ text @#!/usr/lpkg/bin/perl ## ## OSSP quos - Query On Steroids ## Copyright (c) 2004 Ralf S. Engelschall ## Copyright (c) 2004 The OSSP Project ## ## This file is part of OSSP quos, a Web user interface for querying ## a database which can be found at http://www.ossp.org/pkg/tool/quos/. ## ## This program is free software; you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by ## the Free Software Foundation; either version 2 of the License, or ## (at your option) any later version. ## ## This program is distributed in the hope that it will be useful, ## but WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with this program; if not, write to the Free Software ## Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, ## USA, or contact Ralf S. Engelschall . ## ## openpkg-rdf2sql.cfg: OpenPKG XML/RDF to SQL importing tool ## require 5.008; use strict; use Getopt::Long; use XML::Parser; use DBI; use DBD::SQLite; use Data::Dumper; use XML::Simple; use LWP::UserAgent; use IPC::Filter qw(); # configure optional debugging $Data::Dumper::Purity = 1; $Data::Dumper::Indent = 1; $Data::Dumper::Terse = 1; my $ftpserv = 'ftp://anonymous@@ftp.openpkg.org/'; my $starturl = '00INDEX.rdf'; # connect to database my $db = DBI->connect("dbi:SQLite:dbname=openpkg.db", "", "", { RaiseError => 1, AutoCommit => 1 }); # prepare SQL commands my $sql = {}; $sql->{-rdf} = $db->prepare( "INSERT INTO quos_rdf (rd_url) VALUES (?);" ); $sql->{-package} = $db->prepare( "INSERT INTO quos_package" . " (pk_name, pk_version, pk_release, pk_distribution, pk_group," . " pk_license, pk_packager, pk_summary, pk_url, pk_vendor, pk_description, pk_rdf)" . " VALUES (?,?,?,?,?,?,?,?,?,?,?,(SELECT MAX(rd_id) FROM quos_rdf));\n" ); $sql->{-buildprereq} = $db->prepare( "INSERT INTO quos_buildprereq" . " (bp_id, bp_key, bp_op, bp_val)" . " VALUES ((SELECT MAX(pk_id) FROM quos_package),?,?,?);" ); $sql->{-prereq} = $db->prepare( "INSERT INTO quos_prereq" . " (rp_id, rp_key, rp_op, rp_val)" . " VALUES ((SELECT MAX(pk_id) FROM quos_package),?,?,?);" ); $sql->{-provide} = $db->prepare( "INSERT INTO quos_provide" . " (pr_id, pr_key, pr_op, pr_val)" . " VALUES ((SELECT MAX(pk_id) FROM quos_package),?,?,?);" ); $sql->{-source} = $db->prepare( "INSERT INTO quos_source" . " (sr_id, sr_url)" . " VALUES ((SELECT MAX(pk_id) FROM quos_package),?);" ); &fetchrdfsrecursendump2db($starturl); sub fetchrdfsrecursendump2db { my ($url) = @@_; # fetch XML/RDF index file my $ua = new LWP::UserAgent; $ua->agent("openpkg-rdf2sql/1.0"); my $req = new HTTP::Request(GET => $ftpserv . $url); my $rescont = ''; my $res = $ua->request($req); if ($res->is_success) { if ($url =~ m|.*bz2$|) { $rescont = IPC::Filter::filter($res->content, "bzip2 -d"); } else { $rescont = $res->content; } } else { print $res->status_line . "\n"; } # parse XML/RDF index file my $xml = new XML::Simple; my $rdf = $xml->XMLin( $rescont, KeepRoot => 1, ForceContent => 0, ForceArray => 1, ); undef $xml; # iterate over XML/RDF data structure foreach my $repo (@@{$rdf->{'rdf:RDF'}->[0]->{'Repository'}}) { my $rd_url = $repo->{'rdf:resource'}; # start SQL transaction $db->begin_work(); # store repository information $sql->{-rdf}->execute($rd_url); # iterate over all packages in a repository if (defined($repo->{'rdf:Description'})) { foreach my $desc (@@{$repo->{'rdf:Description'}}) { # store simple (single-value) properties of a package my $prop = {}; foreach my $attr (qw( Name Version Release Distribution Group License Packager Summary URL Vendor Description )) { $prop->{$attr} = $desc->{$attr}->[0]; } $sql->{-package}->execute( $prop->{'Name'}, $prop->{'Version'}, $prop->{'Release'}, $prop->{'Distribution'}, $prop->{'Group'}, $prop->{'License'}, $prop->{'Packager'}, $prop->{'Summary'}, $prop->{'URL'}, $prop->{'Vendor'}, $prop->{'Description'} ); # store complex (multi-value) properties of a package foreach my $attr (qw( BuildPreReq PreReq Provide )) { foreach my $el (@@{$desc->{$attr}->[0]->{'rdf:bag'}->[0]->{'resource'}}) { my ($key, $op, $val) = ($el, '=', '*'); if (ref($key) eq 'HASH') { $key = $el->{'content'}; $op = (grep { $_ ne 'content' } keys(%{$el}))[0]; $val = $el->{$op}; } $sql->{"-".lc($attr)}->execute($key, $op, $val); } } foreach my $url (@@{$desc->{'Source'}->[0]->{'rdf:bag'}->[0]->{'rdf:li'}}) { $sql->{-source}->execute($url); } } } # iterate over all sub-repositities in repository if (defined($repo->{'Repository'})) { foreach my $repcont (@@{$repo->{'Repository'}}) { $url =~ m|^(.*/)|; my $actpath = $1; &fetchrdfsrecursendump2db($actpath . $repcont->{'href'}); sleep(1); } } # end SQL transaction $db->commit(); } } # commit and disconnect from database $db->disconnect(); # temporary function for showing actual XML sub showactxml { my ($xmlinput) = @@_; my $xml = new XML::Simple; my $rdf = $xml->XMLin( $xmlinput, KeepRoot => 1, ForceContent => 0, ForceArray => 1, ); undef $xml; print Dumper($rdf); } __END__ =pod =head1 NAME B - OpenPKG XML/RDF to SQL importing tool =head1 SYNOPSIS B [B<-v>|B<--verbose>] ...FIXME... I =head1 DESCRIPTION =cut @ 1.6 log @I'm not 100% sure but for 99% of certainty I think we should use smaller transactions: 1. it is unnecessary to be able to roll back such much, 2. the foreign key determination could be wrong?, 3. the whole crawling takes long and a transaction should not @ text @d119 3 a123 1 $db->begin_work(); a127 1 # from now on package descriptions d164 3 a166 2 else { # from now on RDF references d174 2 @ 1.5 log @cleanup style @ text @a82 2 $db->begin_work(); d121 1 a121 1 #$db->begin_work(); d172 1 a172 1 #$db->commit(); a176 1 $db->commit(); @ 1.4 log @appended recursive search for rdfs to rdf2sql tool @ text @d44 1 a44 1 my $ftpserv = 'ftp://anonymous:herb@@ftp.openpkg.org/'; a87 1 d89 2 d92 1 a92 1 $ua->agent("rdfcrawl/1.0 "); a94 1 d97 1 a97 2 $_ = $url; if (m|.*bz2$|) { d105 1 a105 1 print $res->status_line, "\n"; d108 1 d118 1 a118 1 # iterate over XML/RDF data structure d123 1 a123 1 # $db->begin_work(); d126 1 a126 1 # interate over all packages in a repository d128 1 a128 1 # from now on package descriptions d166 1 a166 1 # from now on rdf references d168 4 a171 4 $url =~ m|^(.*/)|; my $actpath = $1; &fetchrdfsrecursendump2db($actpath . $repcont->{'href'}); sleep(1); d174 1 a174 1 # $db->commit(); d177 2 a178 1 # commit and disconnect from database d182 1 a183 2 # temporary function for showing actual xml a186 1 d193 2 a194 2 undef $xml; print Dumper($rdf); @ 1.3 log @use strict-OO style @ text @d1 1 a1 1 #!/usr/opkg/bin/perl d29 1 d36 2 d44 2 a45 10 my $xml = new XML::Simple; my $rdf = $xml->XMLin( "openpkg.rdf", KeepRoot => 1, ForceContent => 0, ForceArray => 1, ); undef $xml; #print Dumper($xml); #exit(0); d83 3 a85 3 # iterate over XML/RDF data structure foreach my $repo (@@{$rdf->{'rdf:RDF'}->[0]->{'Repository'}}) { my $rd_url = $repo->{'rdf:resource'}; d87 16 a102 13 # store repository information $db->begin_work(); $sql->{-rdf}->execute($rd_url); # interate over all packages in a repository foreach my $desc (@@{$repo->{'rdf:Description'}}) { # store simple (single-value) properties of a package my $prop = {}; foreach my $attr (qw( Name Version Release Distribution Group License Packager Summary URL Vendor Description )) { $prop->{$attr} = $desc->{$attr}->[0]; d104 58 a161 18 $sql->{-package}->execute( $prop->{'Name'}, $prop->{'Version'}, $prop->{'Release'}, $prop->{'Distribution'}, $prop->{'Group'}, $prop->{'License'}, $prop->{'Packager'}, $prop->{'Summary'}, $prop->{'URL'}, $prop->{'Vendor'}, $prop->{'Description'} ); # store complex (multi-value) properties of a package foreach my $attr (qw( BuildPreReq PreReq Provide )) { foreach my $el (@@{$desc->{$attr}->[0]->{'rdf:bag'}->[0]->{'resource'}}) { my ($key, $op, $val) = ($el, '=', '*'); if (ref($key) eq 'HASH') { $key = $el->{'content'}; $op = (grep { $_ ne 'content' } keys(%{$el}))[0]; $val = $el->{$op}; a162 1 $sql->{"-".lc($attr)}->execute($key, $op, $val); d165 8 a172 2 foreach my $url (@@{$desc->{'Source'}->[0]->{'rdf:bag'}->[0]->{'rdf:li'}}) { $sql->{-source}->execute($url); d174 1 a175 1 $db->commit(); d177 3 d181 16 a196 2 # disconnect from database $db->disconnect(); @ 1.2 log @first cut for RDF to SQL translator @ text @d41 2 a42 1 my $rdf = XML::Simple::XMLin( d48 1 @ 1.1 log @commit initial work on OSSP quos @ text @d33 2 d36 99 a134 1 # ...FIXME... @