#/usr/local/bin/perl -w
#omim_4.pl
#
#This script was written by Jules Berman on 7/19/05
#
#The software is provided "as is", without warranty of any kind,
#express or implied, including but not limited to the warranties
#of merchantability, fitness for a particular purpose and
#noninfringement. In no event shall the authors or copyright
#holders be liable for any claim, damages or other liability,
#whether in an action of contract, tort or otherwise, arising
#from, out of or in connection with the software or the use or
#other dealings in the software.
#
#As a courtesy, users of this script should cite the following
#publication, which describes the algorithm used in this is script:
#
#Berman JJ. Perl Programming for Medicine and Biology.
#Jones and Bartlett, Sudbury, MA, pp 311-324, 2007.
#
#The purpose of this script is to collect all the omim records that
#contain include the names of tumors and to determine whether the
#tumors violate lineage.
#
#
open (TEXT,"neocl.xml")||die"Cannot";
my $line = " ";
my $count = 1;
my (%valuecount, %literalhash, %doubhash);
while ($line ne "")
{
my ($doublet, $phrase, $code, $i);
$line = <TEXT>;
next if $line =~ /\"(S[0-9]{7})\"/;
$line =~ /\"(C[0-9]{7})\"/;
$code = $1;
next if ($code =~ /C000000/); #gets rid of unclassified and precancers
$line =~ /\"\> ?(.+) ?\<\//;
$phrase = $1;
$phrase =~ s/\'s//g;
$phrase =~ s/\,/ /g;
$phrase =~ s/[^a-z0-9 \-]/ /g;
$phrase =~ s/\b([a-z]+oma)s/$1/g;
$phrase =~ s/\b(tumo[u]?r)s/$1/g;
$phrase =~ s/\b(neoplasm)s/$1/g;
$phrase =~ s/\b(leuk[a]?emia)s/$1/g;
$phrase =~ s/ +/ /g;
if ($phrase !~ / /)
{
$literalhash{$phrase} = $code;
next;
}
$literalhash{$phrase} = $code;
@hoparray = split(/ /,$phrase);
for ($i=0;$i<(scalar(@hoparray)-1);$i++)
{
$doublet = "$hoparray[$i] $hoparray[$i+1]";
next unless ($doublet =~ /[a-z]+[a-z0-9\-]* [a-z]+[a-z0-9\-]*/);
$doubhash{$doublet}="";
}
}
close TEXT;
open (TEXT, neoself)||die"Can't open file";
$line = " ";
%neoself;
my @types = ("primitive", "endoderm_or_ectoderm", "mesoderm", "neuroectoderm");
while ($line ne "")
{
$line = <TEXT>;
next if $line !~ /^[0-9]+\|/;
my @linearray = split(/\|/,$line);
foreach $thing (@types)
{
next if ($linearray[3] !~ /$thing/);
$neoself{$linearray[2]} = $thing;
}
}
close TEXT;
$start = time();
open (OUT, ">omim.xml")||die"Can't open file";
open (BAD, ">omim.bad")||die"Can't open file";
open (TWO, ">omim.two")||die"Can't open file";
print OUT "\<\?xml version\=\"1\.0\"\?\>\n";
print OUT "\\n";
$/ = "*RECORD*";
open (TEXT, "omim")||die"Can't open file";
$line = <TEXT>;
$line = " ";
$hold = "";
$recordcount = 0;
my $tumorrecordcount = 0;
while ($line ne "")
{
$hold = "";
my (%subhash, $key, $code, $frontline, $endline, $smallfrontline, $smallendline);
$line = ;
$recordcount++;
if ($line =~ /\*FIELD\* NO\n([0-9]+)\n\*FIELD\* TI\n([^\*]+)\*FIELD\* TX/m)
{
$hold = " \\n";
$title = $2;
$title = lc($title);
$title =~ s/\n/ /g;
}
else #if the record doesn't have a title, we're not interested in it
{
next;
}
$line = lc($line);
$line =~ s/\'[s]?//g;
$line =~ s/\b([a-z]+oma)s/$1/g;
$line =~ s/\b(tumo[u]?r)s/$1/g;
$line =~ s/\b(neoplasm)s/$1/g;
$line =~ s/\b(leuk[a]?emia)s/$1/g;
$line =~ s/[^a-z0-9 \-\:\.\n]/ /g;
if ($line ne "")
{
my ($i, $doubline);
$line =~ s/\.$//o;
$line =~ s/\n/ /g;
$line =~ s/ +/ /g;
$line =~ s/^ +//o;
$line =~ s/ +$//o;
$hold = $hold . " \";
$grepline = $title;
if (length($title)>50)
{
@eighty_line=grep{length()>0}split(m"(.{1,60})\b",$grepline);
foreach my $value (@eighty_line)
{
$value =~ s/^ +//o;
$hold = $hold . " $value\n";
}
$hold =~ s/\<dc\:title\> +/\<<dc\:title\>/o;
}
else
{
$hold = $hold . "$title\n";
}
$hold =~ s/\n$//o;
$hold = $hold . "\<\/dc\:title\>\n";
#$line =~ s/[^a-z 0-9\-]/ /g;
$line = "zzzzz $line zzzzz zzzzz";
#$line =~ s/ +/ /g;
@hoparray = split(/ /,$line);
for ($i=0;$i<(scalar(@hoparray)-1);$i++)
{
if (exists $literalhash{$hoparray[$i]})
{
$subhash{$hoparray[$i]}=$literalhash{$hoparray[$i]};
}
$doublet = "$hoparray[$i] $hoparray[$i+1]";
if (exists $literalhash{$doublet})
{
$subhash{$doublet}=$literalhash{$doublet};
}
if (exists $doubhash{$doublet})
{
if ($doubline ne "")
{
$doubline = $doubline . " $hoparray[$i+1]";
if (exists $literalhash{$doubline})
{
$subhash{$doubline}=$literalhash{$doubline};
}
if ($doubline =~ /^[a-z0-9\-]+ /)
{
$endline = $';
if (exists $literalhash{$endline})
{
$subhash{$endline}=$literalhash{$endline};
}
}
if ($doubline =~ / [a-z0-9\-]+$/)
{
$frontline = $`;
if (exists $literalhash{$frontline})
{
$subhash{$frontline}=$literalhash{$frontline};
}
}
if ($doubline =~ /^[a-z0-9\-]+ (.+) [a-z0-9\-]+$/)
{
$midline = $1;
if (exists $literalhash{$midline})
{
$subhash{$midline}=$literalhash{$midline};
}
}
}
else
{
$doubline = $doublet;
}
}
else
{
if ($doubline ne "")
{
if (exists $literalhash{$doubline})
{
$subhash{$doubline}=$literalhash{$doubline};
}
if ($doubline =~ / [a-z0-9\-]+$/)
{
$frontline = $`;
if (exists $literalhash{$frontline})
{
$subhash{$frontline}=$literalhash{$frontline};
}
}
if ($doubline =~ / [a-z0-9\-]+ [a-z0-9\-]+$/)
{
$smallfrontline = $`;
if (exists $literalhash{$smallfrontline})
{
$subhash{$smallfrontline}=$literalhash{$smallfrontline};
}
}
if ($doubline =~ /^[a-z0-9\-]+ /)
{
$endline = $';
if (exists $literalhash{$endline})
{
$subhash{$endline}=$literalhash{$endline};
}
}
if ($doubline =~ /^[a-z0-9\-]+ [a-z0-9\-]+ /)
{
$smallendline = $';
if (exists $literalhash{$smallendline})
{
$subhash{$smallendline}=$literalhash{$smallendline};
}
}
if ($doubline =~ /^[a-z0-9\-]+ (.+) [a-z0-9\-]+$/)
{
$midline = $1;
if (exists $literalhash{$midline})
{
$subhash{$midline}=$literalhash{$midline};
}
}
$doubline = "";
}
}
}
if (scalar(keys(%subhash)) == 0)
{
undef %subhash;
next;
}
$tumorrecordcount++;
print OUT "$tumorrecordcount\: $hold";
my %smallvaluecount;
while ((my $key, my $value) = each(%subhash))
{
print OUT " \\n";
if (exists $neoself{$value})
{
my $type = $neoself{$value};
$valuecount{$type}++;
$smallvaluecount{$type}++;
}
}
if (scalar(keys(%smallvaluecount)) == 2)
{
if (exists $smallvaluecount{"primitive"})
{
undef %smallvaluecount;
next;
}
print BAD "\n";
print BAD $hold;
foreach $letter (sort (keys(%smallvaluecount)))
{
print TWO "$letter ";
}
print TWO "\n";
while ((my $key, my $value) = each(%subhash))
{
print BAD " $key $value $neoself{$value}\n";
}
while (($key, $value) = each(%smallvaluecount))
{
print BAD " $key => $value\n";
}
}
undef %smallvaluecount;
undef %subhash;
print OUT" \<\/rdf\:Description\>\n";
}
}
print OUT "\<\/rdf\:RDF\>";
close TEXT;
close OUT;
$end = time();
$totaltime = $end - $start;
print "\nThe time to code was $totaltime seconds\n";
print "The number of records in omim is $recordcount\n";
print "The number of tumorous records in omim is $tumorrecordcount\n";
print "The number of primitive tumors listed in omim is " . $valuecount{"primitive"} . "\n";
print "The number of endoderm_or_ectoderm tumors listed in omim is " . $valuecount{"endoderm_or_ectoderm"} . "\n";
print "The number of mesoderm tumors listed in omim is " . $valuecount{"mesoderm"} . "\n";
print "The number of neuroectoderm tumors listed in omim is " . $valuecount{"neuroectoderm"} . "\n";
exit;
Last modified: January 6, 2008