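#!/usr/local/bin/perl

# Perl converter MDL .mol to CML - Version 1.4 - schema version
# NOTE(review): despite the title, the code below stores the parsed molecule
# in a MySQL database via DBI; no CML output is produced by this file.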

use DBI;

use strict;
use warnings;

###########################
##   Read in the input   ##
###########################

# Usage: <script> FILE [NAME]
#   FILE  input name without extension; ".mol" is appended to find the file
#   NAME  molecule name stored in the `name` table (defaults to FILE)
#
# The script parses the atom and bond lines of the .mol file and inserts one
# row into `molecule`, one row per atom into `atom`, one row per bond into
# `bond` and one row into `name`.

# Return the content of the file at $path as a single string with up to
# three trailing newlines removed. Dies if the file cannot be opened.
sub read_mol_text {
    my ($path) = @_;

    open(my $fh, '<', $path) or die("Cannot open $path: $!\n");
    my $text = do { local $/; <$fh> };    # undef $/ => read whole file at once
    close($fh);

    $text = '' if not defined $text;
    chomp($text) for 1 .. 3;
    return $text;
}

# Extract atoms and bonds from the text of a .mol file.
# Returns two array references:
#   atoms - list of [ x, y, z, element ] in file order
#   bonds - list of [ atom_1, atom_2, bond_type ] in file order
sub parse_mol_text {
    my ($text) = @_;

    # Remove the counts line (a run of integers ending in "V2000"); its first
    # three integers would otherwise be picked up by the bond pattern below.
    $text =~ s/\s+([\d]+\s+)+V2000//;

    # An atom line: three coordinates followed by an alphabetic symbol.
    my $atom_line = qr{
        (?=\n) \s+        # start at a line break; skip it and the indentation
        ([-\d\.]+) \s+    # x
        ([-\d\.]+) \s+    # y
        ([-\d\.]+) \s+    # z
        ([A-Za-z]+)       # atom symbol
        [^\n]*            # rest of the line is ignored
        (?=\n)            # an atom line is always followed by another line
    }x;

    # A bond line: three integers (first atom, second atom, bond type).
    # The previous trailing lookahead "(?=\n||$)" contained an empty
    # alternative and therefore always succeeded, so it is omitted here.
    my $bond_line = qr{
        (?=\n) \s+        # start at a line break; skip it and the indentation
        ([\d]+) \s+       # index of first atom
        ([\d]+) \s+       # index of second atom
        ([\d]+)           # bond type
        [^\n]*            # rest of the line is ignored
    }x;

    my @atoms;
    while ($text =~ /$atom_line/g) {
        push @atoms, [ $1, $2, $3, $4 ];
    }

    # The failed match that ended the loop above reset pos($text), so the
    # bond scan starts from the beginning of the text again.
    my @bonds;
    while ($text =~ /$bond_line/g) {
        push @bonds, [ $1, $2, $3 ];
    }

    return (\@atoms, \@bonds);
}

# Insert the molecule, its atoms, bonds and name using the handle $dbh.
# All values are passed as bind parameters so that neither file content nor
# the user-supplied name can alter the SQL. Returns the new mol_id.
sub store_molecule {
    my ($dbh, $mol_name, $atoms, $bonds) = @_;

    $dbh->do(
        q{INSERT INTO molecule (mol_id, num_atoms, num_bonds, CAS)
          VALUES( NULL, ?, ?, '' )},
        undef, scalar(@$atoms), scalar(@$bonds)
    );

    # Ask the driver for the generated key instead of tagging the row with
    # CAS='new' and selecting it back, which could return a different row
    # if another 'new' row existed. Assumes mol_id is AUTO_INCREMENT, as
    # inserting NULL into it already implied.
    my $mol_id = $dbh->last_insert_id(undef, undef, 'molecule', 'mol_id');
    die("Could not determine mol_id of the inserted molecule\n")
        if not defined $mol_id;

    my $atom_sth = $dbh->prepare(
        q{INSERT INTO atom (atom_id, mol_id, sequence, x, y, z, atom_type)
          VALUES( NULL, ?, ?, ?, ?, ?, ? )}
    );
    my $sequence = 0;
    for my $entry (@$atoms) {
        my ($x, $y, $z, $atom) = @$entry;
        $sequence++;    # 1-based position of the atom in the file
        print "x: $x, y: $y, z: $z, atom: $atom\n";
        $atom_sth->execute($mol_id, $sequence, $x, $y, $z, $atom);
    }

    # Markup bond data
    my $bond_sth = $dbh->prepare(
        q{INSERT INTO bond (bond_id, mol_id, atom_1, atom_2, bond_type)
          VALUES( NULL, ?, ?, ?, ? )}
    );
    for my $entry (@$bonds) {
        my ($atom_1, $atom_2, $type) = @$entry;
        print "atom_1: $atom_1, atom_2: $atom_2, type: $type\n";
        $bond_sth->execute($mol_id, $atom_1, $atom_2, $type);
    }

    $dbh->do(
        q{INSERT INTO name (name_id, mol_id, name )
          VALUES( NULL, ?, ? )},
        undef, $mol_id, $mol_name
    );

    return $mol_id;
}

# make sure input file is defined
if (not defined $ARGV[0]) {
    die("No file name supplied\n");
}
my $file = $ARGV[0];

# the molecule name is optional and falls back to the file name
my $mol_name = $ARGV[1];
if (not defined $mol_name) {
    $mol_name = $file;
    print "molecule name set to filename ($file)\n";
}

# locate the input file
my $input_file = $file . ".mol";

# make sure input file is present
if (!-e $input_file) {
    die("The file $input_file doesn't exist.");
}

# read and parse before touching the database, so an unreadable file
# never results in a connection or a partially written molecule
my ($atoms, $bonds) = parse_mol_text(read_mol_text($input_file));

my $dsn       = "DBI:mysql:compound:localhost";    # data source
my $user_name = "chem";                            # user name
my $password  = "chem";                            # password

# connect to database; RaiseError makes every failing DBI call die
my $dbh = DBI->connect($dsn, $user_name, $password,
                       { RaiseError => 1, PrintError => 0 });

# Group all inserts into one transaction so a failure half way through does
# not leave a molecule without its atoms/bonds. This only has an effect on
# transactional storage engines; on others the statements apply immediately.
$dbh->begin_work();
my $stored = eval {
    store_molecule($dbh, $mol_name, $atoms, $bonds);
    $dbh->commit();
    1;
};
if (!$stored) {
    my $error = $@ || "unknown error while storing molecule\n";
    eval { $dbh->rollback() };    # best effort; the original error is reported
    $dbh->disconnect();
    die($error);
}

$dbh->disconnect();
exit(0);
