CPAN-Search-Lite documentation

CPAN::Search::Lite::Index - set up or update database tables.

Code Index:


__top


NAME

CPAN::Search::Lite::Index - set up or update database tables.

__top


SYNOPSIS

 my $index = CPAN::Search::Lite::Index->new(config => 'cpan.conf', setup => 1);
 $index->index();

__top


DESCRIPTION

This is the main module used to set up or update the database tables used to store information from the CPAN and ppm indices. The creation of the object

 my $index = CPAN::Search::Lite::Index->new(%args);

accepts three arguments:

__top


CONFIGURATION

Most of the options used to control the behaviour of the indexing are contained in a configuration file. An example of the format of such a file is

 [CPAN]
 CPAN = /var/ftp/pub/CPAN
 pod_root = /usr/local/POD
 html_root = /usr/local/httpd/htdocs/CPAN
 [DB]
 db = pause
 user = sarah
 passwd = lianne
 [WWW]
 css = cpan.css
 up_img = up.gif
 tt2 = /usr/local/tt2
 geoip = /usr/local/share/geoip/cpan.txt

This consists of 3 sections.

CPAN

This is associated with various things related to CPAN.

DB

This is used to store connection information to the database used to populate the tables.

WWW

This is used for various information related to a web interface.

__top


DETAILS

Calling

  $index->index();

will start the indexing procedure. Various messages detailing the progress will written to STDOUT, which by default will be captured into a file cpan_search_log.dddddddddd, where the extension is the time that the method was invoked. Passing index an argument of log => log_file will save these messages into log_file. Error messages are not captured, and will appear in STDERR.

The steps of the indexing procedure are as follows.

__top


SEE ALSO

the CPAN::Search::Lite::Info manpage, the CPAN::Search::Lite::PPM manpage, the CPAN::Search::Lite::State manpage, the CPAN::Search::Lite::Extract manpage, the CPAN::Search::Lite::Populate manpage, and the CPAN::Search::Lite::Util manpage. Development takes place on the CPAN-Search-Lite project at http://sourceforge.net/projects/cpan-search/.

__top


COPYRIGHT

This software is copyright 2004 by Randy Kobes <randy@theoryx5.uwinnipeg.ca>. Use and redistribution are under the same terms as Perl itself.

__top


package CPAN::Search::Lite::Index;
use strict;
use warnings;
use CPAN::Search::Lite::Info;
use CPAN::Search::Lite::PPM;
use CPAN::Search::Lite::Extract;
use CPAN::Search::Lite::State;
use CPAN::Search::Lite::Populate;
use Config::IniFiles;
use File::Spec::Functions qw(catfile);
use File::Basename;
use File::Path;
use LWP::Simple qw(getstore is_success);
use Locale::Country;
use CPAN::Search::Lite::DBI qw($tables);

our ($oldout, $VERSION);
$VERSION = 0.66;

sub new {
    my ($class, %args) = @_;

    my $env_cfg = $ENV{CSL_CONFIG_FILE};
    if ($env_cfg and not -f $env_cfg) {
      die qq{\$ENV{CSL_CONFIG_FILE} = "$env_cfg" not found};
    }
    my $opt_cfg = $args{config};
    if ($opt_cfg and not -f $opt_cfg) {
      die qq{Config file "$opt_cfg" not found};
    }
    if ($env_cfg) {
      if (not $opt_cfg) {
        print qq{Using config file "$env_cfg"\n};
        $args{config} = $env_cfg;
      }
      else {
        print qq{Using config file "$opt_cfg"\n};        
      }
    }
    elsif ($opt_cfg) {
      print qq{Using config file "$opt_cfg"\n};
    }
    else {
      die <<"DEATH";

No configuration file found. Please specify one
either by the "config" option or by setting the
environment variable CSL_CONFIG_FILE.

DEATH
    }

    if ($args{setup} and $args{reindex}) {
      die "Reindexing must be done on an exisiting database";
    }

    read_config(\%args);
    $args{no_ppm} = 1 if ($args{reindex});
    foreach (qw(CPAN db user passwd) ) {
        die "Must supply a '$_' argument" unless $args{$_};
    }
    unless ($args{no_mirror}) {
        foreach (qw(pod_root html_root)) {
            die "Must supply a '$_' argument" unless $args{$_};
        }
    }

    my $self = { index => undef,
                 state => undef,
                 %args,
             };
    bless $self, $class;
}


sub read_config {
    my $args = shift;
    my $cfg = Config::IniFiles->new(-file => $args->{config});
    my $section = 'CPAN';
    my @wanted = qw(CPAN pod_root html_root no_mirror no_cat pod_only split_pod
                cat_threshold no_ppm remote_mirror multiplex);
    my %has = map {$_ => 1} (@wanted, 'ignore');
    foreach ($cfg->Parameters($section)) {
        die "Invalid parameter: $_, in section $section" unless $has{$_};
    }
    foreach (@wanted) {
        $args->{$_} = $cfg->val($section, $_) if $cfg->val($section, $_);
    }
    if ($cfg->val($section, 'ignore')) {
        my @values = $cfg->val($section, 'ignore');
        $args->{ignore} = \@values;
    }
    $section = 'DB';
    @wanted = qw(db user passwd);
    %has = map {$_ => 1} @wanted;
    foreach ($cfg->Parameters($section)) {
        die "Invalid parameter: $_, in section $section" unless $has{$_};
    }
    foreach (@wanted) {
        $args->{$_} = $cfg->val($section, $_) if $cfg->val($section, $_);
    }
    $section = 'WWW';
    @wanted = qw(tt2 css geoip up_img);
    %has = map {$_ => 1} @wanted;
    foreach ($cfg->Parameters($section)) {
        die "Invalid parameter: $_, in section $section" unless $has{$_};
    }
    foreach (@wanted) {
        $args->{$_} = $cfg->val($section, $_) if $cfg->val($section, $_);
    }
}

sub index {
    my ($self, %args) = @_;
    my $log_dir = dirname($self->{config}) || '.';
    my $log_file = $args{log} || 'cpan_search_log.' . time;
    my $log = catfile $log_dir, $log_file;
    $oldout = error_fh($log);
    if ($self->{rebuild_info}) {
      return $self->rebuild_info();
    }
    if ($self->{no_mirror}) {
        my %wanted = map{$_ => $self->{$_}} qw(remote_mirror);
        $self->no_mirror(%wanted);
    }
    my %wanted = map{$_ => $self->{$_}} qw(CPAN tt2 geoip multiplex);
    write_mirror_data(%wanted);

    $self->fetch_info or return;
    unless ($self->{setup}) {
        $self->state or return;
    }
    unless ($self->{no_mirror}) {
        $self->extract or return;
    }
    $self->populate or return;
    return 1;
}

sub rebuild_info {
  my $self = shift;
  my %wanted = map {$_ => $self->{$_}} qw(db user passwd);
  my $cdbi = CPAN::Search::Lite::DBI::Index->new(%wanted) or return;
  foreach my $table(qw(chapters reps)) {
    my $obj = $cdbi->{objs}->{$table};
    next unless my $schema = $obj->schema($tables->{$table});
    $obj->drop_table or die "Dropping table $table failed";
    $obj->create_table($schema) or die "Creating table $table failed";
    $obj->populate or die "Populating $table failed";
  }
  return 1;
}

sub no_mirror {
    my ($self, %args) = @_;
    my $indices = {'MIRRORED.BY' => '.',
                   '01mailrc.txt.gz' => 'authors',
                   'ls-lR.gz' => 'indices',
                   '02packages.details.txt.gz' => 'modules',
                   '03modlist.data.gz' => 'modules',
               };
    my $cpan = $args{remote_mirror} || 'http://www.cpan.org';
    foreach my $index (keys %$indices) {
        my $file = catfile $self->{CPAN}, $indices->{$index}, $index;
        next if (-e $file and -M $file < 0);
        my $dir = dirname($file);
        unless (-d $dir) {
            mkpath($dir, 1, 0755) or die "Cannot mkpath $dir: $!";
        }
        my $from = join '/', ($cpan, $indices->{$index}, $index);
        unless (is_success(getstore($from, $file))) {
            die "Cannot retrieve $file from $from"; 
        }
    }
    return 1;
}

sub fetch_info {
    my $self = shift;
    my $CPAN = $self->{CPAN};
    my $info = CPAN::Search::Lite::Info->new(CPAN => $CPAN,
                                            ignore => $self->{ignore});
    $info->fetch_info() or return;

    my @tables = qw(dists mods auths);
    my $index;
    foreach my $table(@tables) {
        my $class = __PACKAGE__ . '::' . $table;
        my $this = {info => $info->{$table}};
        $index->{$table} = bless $this, $class;
    }

    unless ($self->{no_ppm}) {
        my $ppm = CPAN::Search::Lite::PPM->new(dists => $info->{dists});
        $ppm->fetch_info() or return;
        my $table = 'ppms';
        my $class = __PACKAGE__ . '::' . $table;
        my $this = {info => $ppm->{$table}};
        $index->{$table} = bless $this, $class;
    }
    $self->{index} = $index;
    return 1;
}

sub extract {
    my $self = shift;
    my %wanted = map {$_ => $self->{$_}}
        qw(CPAN state index pod_root html_root css up_img setup 
           split_pod pod_only);
    my $obj = CPAN::Search::Lite::Extract->new(%wanted);
    $obj->extract() or return;
    return 1;
}

sub state {
    my $self = shift;
    my %wanted = map {$_ => $self->{$_}} 
        qw(db user passwd index setup no_ppm reindex);
    my $state = CPAN::Search::Lite::State->new(%wanted);
    $state->state(%wanted) or return;
    $self->{state} = $state;
    return 1;
}

sub populate {
    my $self = shift;
    my %wanted = map {$_ => $self->{$_}} 
        qw(db user passwd index setup no_ppm state no_cat
           cat_threshold html_root no_mirror pod_root);
    my $db = CPAN::Search::Lite::Populate->new(%wanted);
    $db->populate() or return;
    return 1;
}

sub write_mirror_data {
    my (%args) = @_;
    my $CPAN = $args{CPAN};
    my $tt2 = $args{tt2};
    my $geoip = $args{geoip};
    my $results = mirror_list(%args);
 
    my $master = {host => 'www.cpan.org',
                  location => 'Master',
                  http => 'http://www.cpan.org',
              };    
    unshift @$results, $master;

    if (my $redirect = $args{multiplex}) {
        (my $host = $redirect) =~ s!(http|ftp)://!!; 
        my $multiplex = {host => $host,
                         location => 'Multiplexer',
                         http => $redirect,
                     };
        unshift @$results, $multiplex;
    }

    open(my $fh, '>', catfile $tt2, 'mirror_list')
        or die "Could not open $tt2/mirror_list: $!";
    print $fh '[%  mirror_list = [' . "\n";
    foreach my $result(@$results) {
        print $fh '   { host => '.qq{'$result->{host}',};
        (my $location = $result->{location}) =~ s!\'!!g;
        print $fh ' location => '.qq{'$location',};
        foreach my $protocol (qw(http ftp)) {
            next unless $result->{$protocol};
            print $fh '  '.$protocol.' => '.qq{'$result->{$protocol}',};
        }
        print $fh ' }'."\n",
    }
    print $fh '  ]' . "\n" . '%]';
    close $fh;
    return(1) unless $geoip;
    open($fh, '>', $geoip) or die "Cannot open $geoip: $!";
    foreach my $result(@$results) {
        foreach my $protocol (qw(http ftp)) {
            next unless ($result->{$protocol} and $result->{country});
            print $fh $result->{$protocol} . "\t" . $result->{country} . "\n";
        }
    }
    close $fh;
    return 1;
}

sub mirror_list {
    my (%args) = @_;
    my $CPAN = $args{CPAN};
    my $geoip = $args{geoip};
    my $mirror = catfile $CPAN, 'MIRRORED.BY';
    open (my $fh, $mirror) or die "Cannot open $mirror: $!";
    my ($hosts, $host);
    my $ignore = qr/^\#|^\s+$/;
    my $location = qr/^(\w[^:]+):\s*$/;
    my $dst_wanted = qr{^\s+dst_(ftp|http|location)\s+=\s+\"([^\"]+)};
    while (<$fh>) {
        next if /$ignore/;
        if (/$location/) {
            $host = $1;
            next;
        }
        if (/$dst_wanted/) {
            my $key = $1;
            my $value = $2;
            my $country;
            if ($key eq 'http' or $key eq 'ftp') {
                $value =~ s!/$!!;
            }
            else {
                $value =~ s/\s*\([^\)]+\)\s*//;
                my @locs = split /,\s*/, $value;
                $value = join ', ', reverse(@locs);
                if ($geoip) {
                    my $code = country2code($locs[$#locs-1]);
                    $hosts->{$host}->{country} = $code || '';
                }
            }
            $hosts->{$host}->{$key} = $value;
        }
    }
    close $fh;
    my $results;
    for (sort {$hosts->{$a}->{location} cmp $hosts->{$b}->{location}} keys %$hosts) {
        push @$results, {host => $_, location => $hosts->{$_}->{location},
                         http => $hosts->{$_}->{http},
                         ftp => $hosts->{$_}->{ftp},
                         country => $hosts->{$_}->{country},
                        };
    }
    return $results;
}

sub error_fh {
    my $file = shift;
    open(my $tmp, '>', $file) or die "Cannot open $file: $!";
    close $tmp;
    open(my $oldout, '>&STDOUT');
    open(STDOUT, '>', $file) or die "Cannot tie STDOUT to $file: $!";
    select STDOUT; $| = 1;
    return $oldout;
}

sub DESTROY {
    close STDOUT;
    open(STDOUT, '>&', $oldout);
}

1;

__END__


CPAN-Search-Lite documentation