#!/usr/bin/perl

eval 'exec /usr/bin/perl  -S $0 ${1+"$@"}'
    if 0; # not running under some shell

# manitou-mgr
# Copyright (C) 2004-2007 Daniel Vrit

# This file is part of Manitou-Mail (see http://www.manitou-mail.org)
# v0.9.9

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.

#####################################################################
# manitou-mgr
# Manitou-Mail manager (toolbox)
#####################################################################

use strict;

use DBI;
use DBD::Pg qw(:pg_types);
use IO::Handle;
use Getopt::Long;
use Encode;
use Digest::SHA1;

use Manitou::Words qw(load_stopwords index_words flush_word_vectors clear_word_vectors);
use Manitou::Config qw(readconf getconf);
use Manitou::Encoding qw(encode_dbtxt decode_dbtxt);

my $dbh;
my $cnx_string;

my $conf_file;
my $opt_quiet;
my $opt_action;
my $commit_step=100;
my $vacuum_step=1000;
my ($min_mail_id, $max_mail_id);

sub usage {
  print STDERR qq{Usage: $0 [--conf=config_file] [--quiet] --action={reindex-words,hash-attachments,merge-attachments}
 --action=reindex_words [--start=start mail_id] [--end=end mail_id] [--commit=# of msgs in between db commits] [--vacuum=# of commits in between db vacuum ops]
};
  exit 1;
}

STDOUT->autoflush(1);

my $rc = GetOptions("conf:s" => \$conf_file,
		    "action:s" => \$opt_action,
		    "start:i" => \$min_mail_id,
		    "end:i" => \$max_mail_id,
		    "commit:i" => \$commit_step,
		    "vacuum:i" => \$vacuum_step,
		    "quiet" => \$opt_quiet
		   );

if (!$rc || !$opt_action) {
  usage();
}

if (!defined $conf_file && -r "/etc/manitou-mdx.conf") {
  $conf_file="/etc/manitou-mdx.conf"; # default config file
}
if (defined $conf_file) {
  my %err;
  if (!readconf($conf_file, \%err)) {
    print STDERR "Error in config file: ", $err{msg}, "\n";
    exit 1;
  }
}

Connect();
if ($opt_action eq "reindex_words") {
  reindex_words();
}
elsif ($opt_action eq "hash-attachments") {
  hash_attachments();
}
elsif ($opt_action eq "merge-attachments") {
  merge_attachments();
}
else {
  usage();
}
exit(0);

sub Connect {
  $cnx_string=getconf("db_connect_string");
  if (!defined($cnx_string)) {
    $cnx_string=$ENV{'MANITOU_CONNECT_STRING'};
  }
  if (!defined($cnx_string)) {
    die "Please define the db_connect_string parameter in the configuration file, or the MANITOU_CONNECT_STRING environment variable.";
  }
  # $dbh is used for indexing and commits from time to time
  $dbh = DBI->connect($cnx_string) or die "Can't connect: $DBI::errstr";
  $dbh->{PrintError}=0;
  $dbh->{RaiseError}=1;
  $dbh->{pg_auto_escape}=1;
  $dbh->{AutoCommit}=1;
  Manitou::Encoding::get_db_encoding($dbh);
}


sub reindex_words {
  # $dbh2 is used for fetching mail_id from a cursor, inside a transaction
  # that ends only at the end of session (no commit, hence the need for
  # this second database connection)
  my $dbh2 = DBI->connect($cnx_string) or die "Can't connect: $DBI::errstr";
  $dbh2->{AutoCommit}=0;

  load_stopwords($dbh);

  my $where;
  if ($min_mail_id) {
    if ($max_mail_id) {
      $where="WHERE mail_id BETWEEN $min_mail_id AND $max_mail_id";
    } else {
      $where="WHERE mail_id>=$min_mail_id";
    }
  } elsif ($max_mail_id) {
    $where="WHERE mail_id<=$max_mail_id";
  }

  my $sc=$dbh->prepare("SELECT count(*) FROM mail $where");
  $sc->execute;
  my ($total)=$sc->fetchrow_array;

  my $commits=0;
  $dbh2->do("DECLARE c CURSOR FOR SELECT mail_id FROM mail $where ORDER BY mail_id");
  my $s=$dbh2->prepare("FETCH $commit_step FROM c");
  $s->execute;
  my $count=0;
  while ($s->rows>0) {
    my $sthb=$dbh->prepare("SELECT bodytext FROM body where mail_id=?");
    my $sthh=$dbh->prepare("SELECT lines FROM header where mail_id=?");

    my $mail_id;
    $dbh->begin_work;
    while (($mail_id)=$s->fetchrow_array) {
      $count++;
      $sthb->execute($mail_id);
      my ($body)=$sthb->fetchrow_array;
      $sthh->execute($mail_id);
      my ($header)=$sthh->fetchrow_array;
      $body = decode_dbtxt($body);
      $header = decode_dbtxt($header);
      $body .= Manitou::Words::header_contents_to_ftidx($header);
      index_words($dbh, $mail_id, \$body, \$header);
    }

    print "Flushing word vectors..." unless ($opt_quiet);
    flush_word_vectors($dbh);
    clear_word_vectors();
    $dbh->commit;
    print "done ($count/$total)\n" unless ($opt_quiet);
    $commits++;

    if ($commits % $vacuum_step==0) {
      print "Vacuuming..." unless ($opt_quiet);
      $dbh->do("VACUUM ANALYZE inverted_word_index");
      print "done\n" unless ($opt_quiet);
    }
    $s->execute;
  }

  $dbh->begin_work;
  print "Flushing word vectors..." unless ($opt_quiet);
  flush_word_vectors($dbh);
  $dbh->commit;
  print "done\n" unless ($opt_quiet);

  $dbh->do("VACUUM ANALYZE inverted_word_index");
  $dbh2->do("CLOSE c");
  $dbh2->commit;
}

sub hash_attachments {
  my $su=$dbh->prepare("UPDATE attachment_contents SET fingerprint=? WHERE attachment_id=?");
  my $sth=$dbh->prepare("SELECT attachment_id, content FROM attachment_contents WHERE fingerprint IS NULL LIMIT 1000");
  do {
    $dbh->begin_work;
    $sth->execute;
    my $sha1 = Digest::SHA1->new;
    while (my @r=$sth->fetchrow_array) {
      $sha1->reset;
      my $lobj_fd = $dbh->func ($r[1], $dbh->{pg_INV_READ}, 'lo_open');
      die $dbh->errstr if (!defined($lobj_fd));
      my $buf;
      my $nbytes;
      do {
	$nbytes = $dbh->func($lobj_fd, $buf, 16384, 'lo_read');
	$sha1->add($buf);
	die $dbh->errstr if (!defined($nbytes));
      } while ($nbytes==16384);
      $dbh->func ($lobj_fd, 'lo_close');
      my $b64=$sha1->b64digest;
      $su->execute($b64, $r[0]);
      printf("Updating attch_id=%d with hash '%s'\n", $r[0], $b64) unless ($opt_quiet);
    }
    $dbh->commit;
  } while ($sth->rows>0)
}

sub merge_attachments {
  my $sth=$dbh->prepare("SELECT fingerprint,MIN(content) FROM attachment_contents WHERE fingerprint IS NOT NULL GROUP BY fingerprint HAVING count(*)>1");
  my $sth1=$dbh->prepare("SELECT attachment_id, content FROM attachment_contents WHERE fingerprint=? AND content<>?");
  my $sthu=$dbh->prepare("UPDATE attachment_contents SET content=? WHERE attachment_id=?");
  $sth->execute;
  $dbh->begin_work;
  while (my ($fingerprint,$keep_oid)=$sth->fetchrow_array) {
    $sth1->execute($fingerprint, $keep_oid);
    while (my ($id,$oid)=$sth1->fetchrow_array) {
      $sthu->execute($oid,$id);
      if (!$dbh->func($oid, 'lo_unlink')) {
	print "Warning: failed to remove large object $oid\n";
      }
      else {
	print "Removed large object $oid\n";
	$sthu->execute($keep_oid, $id);
      }
    }
  }
  $dbh->commit;
}
