Ucpt:Extract.pl
From Gentoo Linux Wiki
This script parses the Gentoo-Wiki's sql-database dump files to create the UCPT ebuilds, files, structure, and index. The daily backup files needed for extraction can be found here
| Code: extract.pl |
|
#!/usr/bin/perl
#
# ebuild and file extraction script for the UCPT database on
# gentoo-wiki.com. See http://gentoo-wiki.com/UCPT.
#
# UCPT - User-Contributed Portage Tree
#
#
# Usage: download page_table.sql.gz, revision_table.sql.gz and
# text_table.sql.gz, then run 'perl extract.pl' in the directory
# where you have put the gzipped sql dump files.
#
# Note: does not remove 'stale' files (ie. those which have been
# removed from the Wiki)
#
#
# You need to be in the 'portage' user group in order to
# let this script build the digests for you.
#
$create_digests = 1;
# create Wiki index pages for UCPT namespace (eg. gentoo-wiki.com/Index:UCPT)?
$create_index_pages = 1;
$index_page_filename = "UCPT_index_page.txt";
$ebuild_index_page_filename = "UCPT_ebuild_index_page.txt";
@portage_categories = ();
undef %ebuild_wikinames_hash; # for ebuild index creation
undef %descriptions_by_wikiname;
# The mediawiki namespace ID ("Ucpt:" on gentoo-wiki.com has ID 110)
$mynamespace = 110;
$mynamespace_str = "Ucpt:";
# where to create the user-contributed portage tree?
$basedir = ".";
#
# Create the digest files after having created _all_ ebuilds.
# It's not really necessary to do that afterwards.
#
@ebuild_files = ();
$error_ret_code = 0;
#
# Read in relevant page names, their dates and ids.
#
print "Parsing page_table...\n";
undef %wikiname_by_revid;
open PAGEDB, "gzip -dc page_table.sql.gz |";
$page_count = 0;
while (<PAGEDB>) {
chomp;
foreach (split(/\),\(/))
{
if (s/[0-9]+,$mynamespace,\'([^\'\/]+\/[^\'\/]+\/[^\']+)\',\'.*\',[0-9]+,[0-9]+,[0-9]+,[0-9\.]+,\'([0-9]+)\',([0-9]+),[0-9]+/$revid=$3;$wikiname=$1;""/ei)
{
#print "Detected UCPT page: $wikiname ($revid)\n";
$wikiname_by_revid{$revid} = $wikiname;
$page_count++;
}
}
}
close PAGEDB;
print "Detected $page_count pages.\n";
print "Parsing revision table...\n";
undef %wikiname_by_pageid;
undef %wikidate_by_pageid;
open REVDB, "gzip -dc revision_table.sql.gz |";
$page_count = 0;
while(<REVDB>)
{
chomp;
foreach (split(/\),\(/))
{
if(s/([0-9]+),([0-9]+),'.*',[0-9]+,'.*','([0-9]+)',[0-9]+,[0-9]+,([0-9]+)/$revid=$1;$wikidate=$3;$pageid=$4;""/e && defined($wikiname_by_revid{$revid})) {
$wikiname_by_pageid{$pageid} = $wikiname_by_revid{$revid};
$wikidate_by_pageid{$pageid} = $wikidate;
$page_count++;
}
}
}
close REVDB;
print "Found $page_count revisions.\n";
undef %wikiname_by_revid;
#
# Scan text db...
#
print "Parsing text_table...\n";
open TEXTDB, "gzip -dc text_table.sql.gz |";
while (<TEXTDB>) {
chomp;
foreach (split(/\),\(/))
{
$wikiname = "";
if(s/([0-9]+),\'/$pageid=$1;""/ei && ($wikiname = $wikiname_by_pageid{$pageid}))
{
$content = "";
s/^(.*)\',\'[^\']*\'/$content=$1;""/ei;
$date = substr($wikidate_by_pageid{$pageid},0,12);
substr($wikiname,0,1) =~ tr/[A-Z]/[a-z]/;
# remember portage categories and ebuild wikinames for index page creation
remember_portage_categories_and_ebuild_wikinames($wikiname);
# create file
if(($wikiname =~ /\.ebuild$/) || ($wikiname =~ /^[a-zA-Z0-9]+-[a-zA-Z0-9]+\/[^\/]+\/files\//))
{
my $is_ebuild = 0;
#
# use date of wiki post as release tag
# -- not nice but simple and efficient
#
$filename = $wikiname;
if ( $filename =~ s/\.ebuild$/-r$date.ebuild/ ) { $is_ebuild = 1; }
my $sysfilename = $basedir."/".$filename;
#
# File content is enclosed in <pre> and </pre> tags.
# Outside of these tags, one may place comments which will
# only show up on the Wiki, but make sure there is only one
# <pre> and only one </pre> tag in the whole Wiki text!
#
$content =~ s/^.*<pre>(.*)<\/pre>.*$/$1/i;
#
# Convert back escaped chars...
# FIXME: need to do proper unescaping during
# parsing of the SQL command!
#
$content =~ s/\\n/\n/g;
$content =~ s/\\"/"/g;
$content =~ s/\\'/'/g;
$content =~ s/\\\\/\\/g;
$is_ebuild and store_ebuild_description($wikiname,$content);
#
# keep last-modified time of existing _ebuild_ files
#
if ( ! $is_ebuild || ! -f "$sysfilename" )
{
print "$filename\n";
#print "sysfilename=$sysfilename\n";
my $sysdirname = $sysfilename;
if($sysdirname =~ s/\/[^\/]+$//)
{
#print "sysdirname=$sysdirname\n";
system("mkdir -p \"$sysdirname\"");
}
open FILE, "> $sysfilename";
print FILE $content;
close FILE;
{
my $portage_category = $filename;
$portage_category =~ s/^([^\/]+)\/.*$/$1/;
my ($main,$sub) = split ( '-', $portage_category, 2 );
if ( defined($portage_categories{$main}) ) {
$portage_categories{$main} .= ",".$sub;
} else {
$portage_categories{$main} = $sub;
}
}
}
#
# Remember ebuild filename for digest creation
#
if ( $is_ebuild ) {
push @ebuild_files, $sysfilename;
}
}
}
}
}
close TEXTDB;
print "Extraction done.\n";
#
# create digests
#
while ( $ebuild_filename = pop @ebuild_files ) {
#
# digest creation necessary?
#
my $digest_filename = $ebuild_filename;
if ( $digest_filename =~ s/\/([^\/]+)\.ebuild$/\/files\/digest-$1/ ) {
if ( ! -f "$digest_filename" ) {
$digest_filename =~ s/files\/digest-[^\/]+$/Manifest/;
system ( "echo > \"$digest_filename\"" ) if ( ! -f "$digest_filename" );
system ( "ebuild \"$ebuild_filename\" manifest" ) == 0 or $error_ret_code = 1;
}
} else {
print " * * * * ****** ERROR! ****** * * * *\n";
$error_ret_code = 1;
}
}
#
# create Wiki index pages:
# 1. portage categories
# 2. portage categories containing all ebuilds
#
if ( $create_index_pages ) {
my $file_handle;
open $file_handle, "> $index_page_filename";
my $ebuild_index_handle;
open $ebuild_index_handle, "> $ebuild_index_page_filename";
#
# get sorted and unique list of main/major portage categories
#
my @tmp_main_categories = ();
my %main_existence_hash;
my $i;
for ( $i = 0; $i <= $#portage_categories; $i++ ) {
my ($main,$sub) = split ( '-', $portage_categories[$i], 2 );
if ( ! defined($main_existence_hash{$main}) ) {
push @tmp_main_categories, $main;
$main_existence_hash{$main} = 1;
}
}
undef %main_existence_hash;
my @sorted_unique_main_categories = sort @tmp_main_categories;
#print join(":",@sorted_unique_main_categories)."\n";
for ( $i = 0; $i <= $#sorted_unique_main_categories; $i++ ) {
my $main_category = $sorted_unique_main_categories[$i];
print $file_handle "* $main_category\n";
print $ebuild_index_handle "* $main_category\n";
#
# get sorted and unique list of sub/minor portage categories
#
my @tmp_sub_categories = ();
my %sub_existence_hash;
my $j;
for ( $j = 0; $j <= $#portage_categories; $j++ ) {
my ($main,$sub) = split ( '-', $portage_categories[$j], 2 );
if ( $main_category eq $main && !defined($sub_existence_hash{$sub}) ) {
push @tmp_sub_categories, $sub;
$sub_existence_hash{$sub} = 1;
}
}
undef %sub_existence_hash;
my @sorted_unique_sub_categories = sort @tmp_sub_categories;
#print join(":",@sorted_unique_sub_categories)."\n";
my $k;
for ( $k = 0; $k <= $#sorted_unique_sub_categories; $k++ ) {
my $sub_category = $sorted_unique_sub_categories[$k];
print $file_handle "** $sub_category\n";
print $ebuild_index_handle "** $sub_category\n";
my @ebuild_wikinames =
split ( "\t", $ebuild_wikinames_hash{$main_category.'-'.$sub_category} );
my @sorted_ebuild_wikinames = sort @ebuild_wikinames;
my $l;
for ( $l = 0; $l <= $#sorted_ebuild_wikinames; $l++ ) {
my $wikiname = $sorted_ebuild_wikinames[$l];
my $nice_wikiname = $wikiname;
$nice_wikiname =~ s/^.*\/([^\/]+)\.ebuild$/$1/;
print $ebuild_index_handle '*** [['.$mynamespace_str.$wikiname.'|'.
$nice_wikiname.']] '.$descriptions_by_wikiname{$wikiname}."\n";
}
}
print $file_handle "\n";
print $ebuild_index_handle "\n";
}
close $ebuild_index_handle;
close $file_handle;
}
if ( $error_ret_code == 0 ) {
print "All OK.\n";
}
elsif ( $error_ret_code == 1 ) {
print "There were errors during the creation of the digest files!\n";
}
exit $error_ret_code;
sub remember_portage_categories_and_ebuild_wikinames
{
# categories
my $portage_category = @_[0];
$portage_category =~ s/^([^\/]+)\/.*$/$1/;
push @portage_categories, $portage_category;
# ebuild wikinames
my $name = @_[0];
if ( $name =~ /\.ebuild$/ ) {
if(!defined($ebuild_wikinames_hash{$portage_category})) {
$ebuild_wikinames_hash{$portage_category} = $name;
} else {
$ebuild_wikinames_hash{$portage_category} .= "\t".$name;
}
}
}
sub store_ebuild_description {
my ($wikiname,$content) = @_;
my $desc = "";
#print "* $wikiname\n";
if($content =~ s/(^|\n)\s*DESCRIPTION\s*=\s*"(.+)"(\s*|\s*#.*)(\n|$)/$desc=$2;""/ei) {
#print "** $desc\n";
$descriptions_by_wikiname{$wikiname} = $desc;
}
}
|
