#!/usr/bin/perl

# manitou-mgr
# Copyright (C) 2004-2009 Daniel Vrit

# This file is part of Manitou-Mail (see http://www.manitou-mail.org)
# v0.9.12

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.

#####################################################################
# manitou-mgr
# Manitou-Mail manager (toolbox)
#####################################################################

use strict;

use DBI;
use DBD::Pg qw(:pg_types);
use IO::Handle;
use Getopt::Long;
use Encode;
use Digest::SHA1;
use Data::Dumper;

use Manitou::Words qw(load_stopwords index_words flush_word_vectors clear_word_vectors search);
use Manitou::Config qw(readconf getconf);
use Manitou::Encoding qw(encode_dbtxt decode_dbtxt);
use Manitou::Schema;

my $VERSION="0.9.12";

my $dbh;
my $cnx_string;

my $conf_file;
my $opt_quiet;
my $opt_action;
my $opt_search_terms;
my $commit_step=100;
my $vacuum_step=1000;
my ($min_mail_id, $max_mail_id);

sub usage {
  print STDERR qq{Usage: $0 [--conf=config_file] [--quiet] --action={reindex-words,hash-attachments,merge-attachments,print-size,iwi-query}
 --action=reindex_words [--start=start mail_id] [--end=end mail_id] [--commit=# of msgs in between db commits] [--vacuum=# of commits in between db vacuum ops]
 --action=iwi-query --search='search terms'
};
  exit 1;
}

STDOUT->autoflush(1);

my $rc = GetOptions("conf:s" => \$conf_file,
		    "action:s" => \$opt_action,
		    "start:i" => \$min_mail_id,
		    "end:i" => \$max_mail_id,
		    "commit:i" => \$commit_step,
		    "vacuum:i" => \$vacuum_step,
		    "search:s" => \$opt_search_terms,
		    "quiet" => \$opt_quiet
		   );

if (!$rc || !$opt_action) {
  usage();
}

if (!defined $conf_file && -r "/etc/manitou-mdx.conf") {
  $conf_file="/etc/manitou-mdx.conf"; # default config file
}
if (defined $conf_file) {
  my %err;
  if (!readconf($conf_file, \%err)) {
    print STDERR "Error in config file: ", $err{msg}, "\n";
    exit 1;
  }
}

Connect() unless $opt_action eq "create-database";

if ($opt_action eq "reindex_words") {
  reindex_words();
}
elsif ($opt_action eq "hash-attachments") {
  hash_attachments();
}
elsif ($opt_action eq "merge-attachments") {
  merge_attachments();
}
elsif ($opt_action eq "print-size") {
  print_size();
}
elsif ($opt_action eq "create-schema") {
  create_schema();
}
elsif ($opt_action eq "create-database") {
  create_database();
}
elsif ($opt_action eq "iwi-query") {
  iwi_query($opt_search_terms);
}
else {
  usage();
}
exit(0);

sub Connect {
  if (defined $ENV{'MANITOU_CONNECT_STRING'}) {
    $cnx_string=$ENV{'MANITOU_CONNECT_STRING'};
  }
  else {
    $cnx_string=getconf("db_connect_string");
  }
  if (!defined($cnx_string)) {
    die "Please define the db_connect_string parameter in the configuration file, or the MANITOU_CONNECT_STRING environment variable.";
  }
  $dbh = DBI->connect($cnx_string) or die "Can't connect: $DBI::errstr";
  $dbh->{PrintError}=0;
  $dbh->{RaiseError}=1;
  $dbh->{pg_auto_escape}=1;
  $dbh->{AutoCommit}=1;
  Manitou::Encoding::get_db_encoding($dbh);
}

sub create_schema_part {
  foreach (@_) {
    if (!$dbh->do($_)) {
      $dbh->rollback;
      die "Statement failed: $dbh->errstr\n";
    }
  }
}

sub create_database {
  my $dbh1=DBI->connect("dbi:Pg:user=postgres") or die;
  $dbh1->{AutoCommit}=1;
  $dbh1->{RaiseError}=0;
  my $dbname="manitou";
  my $dbuser="manitou";
  $dbh1->do("CREATE USER manitou") or die $dbh1->errstr;
  $dbh1->do("CREATE DATABASE $dbname OWNER $dbuser") or die;
  $dbh=DBI->connect("dbi:Pg:user=$dbuser dbname=$dbname") or die DBI->errstr;
  $dbh->do("CREATE LANGUAGE plpgsql") or die $dbh->errstr;
  create_schema();
}

sub create_schema {
  $dbh->begin_work;
  $dbh->do("SET client_min_messages=warning");
  my @tables=Manitou::Schema::create_table_statements();
  print "Creating tables\n" unless ($opt_quiet);
  create_schema_part(@tables);

  my @sequences=Manitou::Schema::create_sequence_statements();
  print "Creating sequences\n" unless ($opt_quiet);
  create_schema_part(@sequences);

  my @functions=Manitou::Schema::create_function_statements();
  print "Creating functions\n" unless ($opt_quiet);
  create_schema_part(@functions);

  my @triggers=Manitou::Schema::create_trigger_statements();
  print "Creating triggers\n" unless ($opt_quiet);
  create_schema_part(@triggers);

  my $version=Manitou::Schema::current_version();
  $dbh->do("INSERT INTO runtime_info(rt_key,rt_value) VALUES ('schema_version','$version')");

  $dbh->commit;
}

sub reindex_words {
  # $dbh2 is used for fetching mail_id from a cursor, inside a transaction
  # that ends only at the end of session (no commit, hence the need for
  # this second database connection)
  my $dbh2 = DBI->connect($cnx_string) or die "Can't connect: $DBI::errstr";
  $dbh2->{AutoCommit}=0;

  load_stopwords($dbh);

  my $where;
  if ($min_mail_id) {
    if ($max_mail_id) {
      $where="WHERE mail_id BETWEEN $min_mail_id AND $max_mail_id";
    } else {
      $where="WHERE mail_id>=$min_mail_id";
    }
  } elsif ($max_mail_id) {
    $where="WHERE mail_id<=$max_mail_id";
  }

  my $sc=$dbh->prepare("SELECT count(*) FROM mail $where");
  $sc->execute;
  my ($total)=$sc->fetchrow_array;

  my $commits=0;
  $dbh2->do("DECLARE c CURSOR FOR SELECT mail_id FROM mail $where ORDER BY mail_id");
  my $s=$dbh2->prepare("FETCH $commit_step FROM c");
  $s->execute;
  my $count=0;
  while ($s->rows>0) {
    my $sthb=$dbh->prepare("SELECT bodytext FROM body where mail_id=?");
    my $sthh=$dbh->prepare("SELECT lines FROM header where mail_id=?");

    my $mail_id;
    $dbh->begin_work;
    while (($mail_id)=$s->fetchrow_array) {
      $count++;
      $sthb->execute($mail_id);
      my ($body)=$sthb->fetchrow_array;
      $sthh->execute($mail_id);
      my ($header)=$sthh->fetchrow_array;
      $body = decode_dbtxt($body);
      $header = decode_dbtxt($header);
      $body .= Manitou::Words::header_contents_to_ftidx($header);
      index_words($dbh, $mail_id, \$body, \$header);
    }

    print "Flushing word vectors..." unless ($opt_quiet);
    flush_word_vectors($dbh);
    clear_word_vectors();
    $dbh->commit;
    print "done ($count/$total)\n" unless ($opt_quiet);
    $commits++;

    if ($commits % $vacuum_step==0) {
      print "Vacuuming..." unless ($opt_quiet);
      $dbh->do("VACUUM ANALYZE inverted_word_index");
      print "done\n" unless ($opt_quiet);
    }
    $s->execute;
  }

  $dbh->begin_work;
  print "Flushing word vectors..." unless ($opt_quiet);
  flush_word_vectors($dbh);
  $dbh->commit;
  print "done\n" unless ($opt_quiet);

  $dbh->do("VACUUM ANALYZE inverted_word_index");
  $dbh2->do("CLOSE c");
  $dbh2->commit;
}

sub hash_attachments {
  my $su=$dbh->prepare("UPDATE attachment_contents SET fingerprint=? WHERE attachment_id=?");
  my $sth=$dbh->prepare("SELECT attachment_id, content FROM attachment_contents WHERE fingerprint IS NULL LIMIT 1000");
  do {
    $dbh->begin_work;
    $sth->execute;
    my $sha1 = Digest::SHA1->new;
    while (my @r=$sth->fetchrow_array) {
      $sha1->reset;
      my $lobj_fd = $dbh->func ($r[1], $dbh->{pg_INV_READ}, 'lo_open');
      die $dbh->errstr if (!defined($lobj_fd));
      my $buf;
      my $nbytes;
      do {
	$nbytes = $dbh->func($lobj_fd, $buf, 16384, 'lo_read');
	$sha1->add($buf);
	die $dbh->errstr if (!defined($nbytes));
      } while ($nbytes==16384);
      $dbh->func ($lobj_fd, 'lo_close');
      my $b64=$sha1->b64digest;
      $su->execute($b64, $r[0]);
      printf("Updating attch_id=%d with hash '%s'\n", $r[0], $b64) unless ($opt_quiet);
    }
    $dbh->commit;
  } while ($sth->rows>0)
}

sub merge_attachments {
  my $sth=$dbh->prepare("SELECT fingerprint,MIN(content) FROM attachment_contents WHERE fingerprint IS NOT NULL GROUP BY fingerprint HAVING count(*)>1");
  my $sth1=$dbh->prepare("SELECT attachment_id, content FROM attachment_contents WHERE fingerprint=? AND content<>?");
  my $sthu=$dbh->prepare("UPDATE attachment_contents SET content=? WHERE attachment_id=?");
  $sth->execute;
  my %removed; # references to removed LOs
  $dbh->begin_work;
  while (my ($fingerprint,$keep_oid)=$sth->fetchrow_array) {
    $sth1->execute($fingerprint, $keep_oid);
    while (my ($id,$oid)=$sth1->fetchrow_array) {
      if (!exists $removed{$oid}) {
	if (!$dbh->func($oid, 'lo_unlink')) {
	  print "Warning: failed to remove large object $oid (attachment_id=$id)\n";
	}
	else {
	  print "Removed large object $oid\n" unless ($opt_quiet);
	  $removed{$oid}=1;
	}
      }
      $sthu->execute($keep_oid, $id);
    }
  }
  $dbh->commit;
}

sub print_size {
  my @tables=
    (
     "addresses",
     "attachment_contents",
     "attachments",
     "body",
     "header",
     "inverted_word_index",
     "mail",
     "mail_addresses",
     "mail_tags",
     "pg_largeobject",
     "raw_mail",
     "words");

  print "-"x35, "\n";
  my $sth1=$dbh->prepare("SELECT pg_total_relation_size(?)");
  foreach (@tables) {
    $sth1->execute($_);
    my ($sz)=$sth1->fetchrow_array;
    printf("%-20s: %8.2f MB\n", $_, $sz/(1024*1024));
  }

  print "-"x35, "\n";
  my $sth=$dbh->prepare("SELECT pg_size_pretty(pg_database_size(current_database()))");
  $sth->execute;
  my ($szb)=$sth->fetchrow_array;
  printf("Total database size : %s\n", $szb);
}

sub iwi_query {
  my $terms = shift;
  die "A non-empty search query must be specified" if (length($terms)==0);
  my @results = Manitou::Words::search($dbh, $terms);
  print join(",", @results), "\n";
}

__END__

=head1 NAME

 manitou-mgr - A management toolbox for Manitou-Mail

=head1 SYNOPSIS

 manitou-mgr [--conf=/path/to/configfile] [--quiet]
   --action=reindex-words [--start=start mail_id] [--end=end mail_id]
                          [--commit=# of msgs in between db commits]
                          [--vacuum=# of commits in between db vacuum ops]
   --action={hash-attachments,merge-attachments}
   --action=print-size


=head1 DESCRIPTION

=over

=item B<hash-attachments>:
 compute the fingerprints of the attachments in the database that have no fingerprint. This is done automatically by manitou-mdx during normal import.

=item B<merge-attachments>:
 merge attachments that share the same fingerprint. This is done automatically by manitou-mdx during normal import.

=item B<print-size>:
 print the sizes on disk of the main database tables.

=item B<create-database>:
 create a new, empty PostgreSQL database to hold the mail data.

=item B<create-schema>:
 create all database objects to hold the mail data (should be applied to an empty database).

=back

=cut
