Biomedical books by Jules J. Berman, Ph.D., M.D.


biomedical informatics cover Perl Programming for Medicine and Biology Cover Ruby for Medicine and Biology Cover Ruby: The Programming Language


















#/usr/local/bin/perl -w
#omim_4.pl
#
#This script was written by Jules Berman on 7/19/05
#
#The software is provided "as is", without warranty of any kind,
#express or implied, including but not limited to the warranties
#of merchantability, fitness for a particular purpose and
#noninfringement. In no event shall the authors or copyright
#holders be liable for any claim, damages or other liability,
#whether in an action of contract, tort or otherwise, arising
#from, out of or in connection with the software or the use or
#other dealings in the software.
#
#As a courtesy, users of this script should cite the following
#publication, which describes the algorithm used in this is script:
#
#Berman JJ. Perl Programming for Medicine and Biology.
#Jones and Bartlett, Sudbury, MA, pp 311-324, 2007.
#
#The purpose of this script is to collect all the omim records that
#contain include the names of tumors and to determine whether the
#tumors violate lineage.
#
#
open (TEXT,"neocl.xml")||die"Cannot";
my $line = " ";
my $count = 1;
my (%valuecount, %literalhash, %doubhash);
while ($line ne "")
  {
  my ($doublet, $phrase, $code, $i);
  $line = <TEXT>;
  next if $line =~ /\"(S[0-9]{7})\"/;
  $line =~ /\"(C[0-9]{7})\"/;
  $code = $1;
  next if ($code =~ /C000000/); #gets rid of unclassified and precancers
  $line =~ /\"\> ?(.+) ?\<\//;
  $phrase = $1;
  $phrase =~ s/\'s//g;
  $phrase =~ s/\,/ /g;
  $phrase =~ s/[^a-z0-9 \-]/ /g;
  $phrase =~ s/\b([a-z]+oma)s/$1/g;
  $phrase =~ s/\b(tumo[u]?r)s/$1/g;
  $phrase =~ s/\b(neoplasm)s/$1/g;
  $phrase =~ s/\b(leuk[a]?emia)s/$1/g;
  $phrase =~ s/ +/ /g;
  if ($phrase !~ / /)
     {
     $literalhash{$phrase} = $code;
     next;
     }
  $literalhash{$phrase} = $code;
  @hoparray = split(/ /,$phrase);
  for ($i=0;$i<(scalar(@hoparray)-1);$i++)
      {
      $doublet = "$hoparray[$i] $hoparray[$i+1]";
      next unless ($doublet =~ /[a-z]+[a-z0-9\-]* [a-z]+[a-z0-9\-]*/);
      $doubhash{$doublet}="";
      }
  }
close TEXT;
open (TEXT, neoself)||die"Can't open file";
$line = " ";
%neoself;
my @types = ("primitive", "endoderm_or_ectoderm", "mesoderm", "neuroectoderm");
while ($line ne "")
  {
  $line = <TEXT>;
  next if $line !~ /^[0-9]+\|/;
  my @linearray = split(/\|/,$line);
  foreach $thing (@types)
    {
    next if ($linearray[3] !~ /$thing/);
    $neoself{$linearray[2]} = $thing;
    }
  }
close TEXT;
$start = time();
open (OUT, ">omim.xml")||die"Can't open file";
open (BAD, ">omim.bad")||die"Can't open file";
open (TWO, ">omim.two")||die"Can't open file";
print OUT "\<\?xml version\=\"1\.0\"\?\>\n";
print OUT "\\n";
$/ = "*RECORD*";
open (TEXT, "omim")||die"Can't open file";
$line = <TEXT>;
$line = " ";
$hold = "";
$recordcount = 0;
my $tumorrecordcount = 0;
while ($line ne "")
   {
   $hold = "";
   my (%subhash, $key, $code, $frontline, $endline, $smallfrontline, $smallendline);
   $line = ;
   $recordcount++;
   if ($line =~ /\*FIELD\* NO\n([0-9]+)\n\*FIELD\* TI\n([^\*]+)\*FIELD\* TX/m)
     {
     $hold = "    \\n";
     $title = $2;
     $title = lc($title);
     $title =~ s/\n/ /g;
     }
   else    #if the record doesn't have a title, we're not interested in it
     {
     next;
     }
   $line = lc($line);
   $line =~ s/\'[s]?//g;
   $line =~ s/\b([a-z]+oma)s/$1/g;
   $line =~ s/\b(tumo[u]?r)s/$1/g;
   $line =~ s/\b(neoplasm)s/$1/g;
   $line =~ s/\b(leuk[a]?emia)s/$1/g;
   $line =~ s/[^a-z0-9 \-\:\.\n]/ /g;
   if ($line ne "")
     {
     my ($i, $doubline);
     $line =~ s/\.$//o;
     $line =~ s/\n/ /g;
     $line =~ s/ +/ /g;
     $line =~ s/^ +//o;
     $line =~ s/ +$//o;
     $hold = $hold . "        \";
     $grepline = $title;
     if (length($title)>50)
          {
          @eighty_line=grep{length()>0}split(m"(.{1,60})\b",$grepline);
          foreach my $value (@eighty_line)
             {
             $value =~ s/^ +//o;
             $hold = $hold . "        $value\n";
             }
          $hold =~ s/\<dc\:title\> +/\<<dc\:title\>/o;
          }
    else
          {
          $hold = $hold . "$title\n";
          }
     $hold =~ s/\n$//o;
     $hold = $hold . "\<\/dc\:title\>\n";
     #$line =~ s/[^a-z 0-9\-]/ /g;
     $line = "zzzzz $line zzzzz zzzzz";
     #$line =~ s/ +/ /g;
     @hoparray = split(/ /,$line);
     for ($i=0;$i<(scalar(@hoparray)-1);$i++)
        {
        if (exists $literalhash{$hoparray[$i]})
           {
           $subhash{$hoparray[$i]}=$literalhash{$hoparray[$i]};
           }
        $doublet = "$hoparray[$i] $hoparray[$i+1]";
        if (exists $literalhash{$doublet})
           {
           $subhash{$doublet}=$literalhash{$doublet};
           }
        if (exists $doubhash{$doublet})
            {
            if ($doubline ne "")
              {
              $doubline = $doubline . " $hoparray[$i+1]";
              if (exists $literalhash{$doubline})
                 {
                 $subhash{$doubline}=$literalhash{$doubline};
                 }
              if ($doubline =~ /^[a-z0-9\-]+ /)
                 {
                 $endline = $';
                 if (exists $literalhash{$endline})
                    {
                    $subhash{$endline}=$literalhash{$endline};
                    }
                 }
              if ($doubline =~ / [a-z0-9\-]+$/)
                 {
                 $frontline = $`;
                 if (exists $literalhash{$frontline})
                    {
                    $subhash{$frontline}=$literalhash{$frontline};
                    }
                 }
              if ($doubline =~ /^[a-z0-9\-]+ (.+) [a-z0-9\-]+$/)
                 {
                 $midline = $1;
                 if (exists $literalhash{$midline})
                    {
                    $subhash{$midline}=$literalhash{$midline};
                    }
                 }
              }
            else
              {
              $doubline = $doublet;
              }
            }
        else
            {
            if ($doubline ne "")
               {
               if (exists $literalhash{$doubline})
                  {
                  $subhash{$doubline}=$literalhash{$doubline};
                  }
               if ($doubline =~ / [a-z0-9\-]+$/)
                  {
                  $frontline = $`;
                  if (exists $literalhash{$frontline})
                     {
                     $subhash{$frontline}=$literalhash{$frontline};
                     }
                  }
              if ($doubline =~ / [a-z0-9\-]+ [a-z0-9\-]+$/)
                 {
                 $smallfrontline = $`;
                 if (exists $literalhash{$smallfrontline})
                    {
                    $subhash{$smallfrontline}=$literalhash{$smallfrontline};
                    }
                 }
              if ($doubline =~ /^[a-z0-9\-]+ /)
                 {
                 $endline = $';
                 if (exists $literalhash{$endline})
                    {
                    $subhash{$endline}=$literalhash{$endline};
                    }
                 }
              if ($doubline =~ /^[a-z0-9\-]+ [a-z0-9\-]+ /)
                 {
                 $smallendline = $';
                 if (exists $literalhash{$smallendline})
                    {
                    $subhash{$smallendline}=$literalhash{$smallendline};
                    }
                 }
              if ($doubline =~ /^[a-z0-9\-]+ (.+) [a-z0-9\-]+$/)
                 {
                 $midline = $1;
                 if (exists $literalhash{$midline})
                    {
                    $subhash{$midline}=$literalhash{$midline};
                    }
                 }
               $doubline = "";
               }
            }
        }
        if (scalar(keys(%subhash)) == 0)
           {
           undef %subhash;
           next;
           }
        $tumorrecordcount++;
        print OUT "$tumorrecordcount\: $hold";
        my %smallvaluecount;
        while ((my $key, my $value) = each(%subhash))
           {
           print OUT "        \\n";
           if (exists $neoself{$value}) 
              {
              my $type = $neoself{$value};
              $valuecount{$type}++;
              $smallvaluecount{$type}++;
              }
           }
        if (scalar(keys(%smallvaluecount)) == 2)
           {
           if (exists $smallvaluecount{"primitive"})
              {
              undef %smallvaluecount;
              next;
              }
           print BAD "\n";
           print BAD $hold;
           foreach $letter (sort (keys(%smallvaluecount)))
               {
               print TWO "$letter ";
               }
           print TWO "\n";
           while ((my $key, my $value) = each(%subhash))
              {
              print BAD "   $key $value $neoself{$value}\n";
              }
           while (($key, $value) = each(%smallvaluecount))
               {
               print BAD "   $key => $value\n";
               }
           }
        undef %smallvaluecount;
        undef %subhash;
    print OUT"    \<\/rdf\:Description\>\n";
    }
  }

print OUT "\<\/rdf\:RDF\>";
close TEXT;
close OUT;
$end = time();
$totaltime = $end - $start;
print "\nThe time to code was $totaltime seconds\n";
print "The number of records in omim is $recordcount\n";
print "The number of tumorous records in omim is $tumorrecordcount\n";
print "The number of primitive tumors listed in omim is " . $valuecount{"primitive"} . "\n";
print "The number of endoderm_or_ectoderm tumors listed in omim is " .  $valuecount{"endoderm_or_ectoderm"} . "\n";
print "The number of mesoderm tumors listed in omim is " . $valuecount{"mesoderm"} . "\n";
print "The number of neuroectoderm tumors listed in omim is " . $valuecount{"neuroectoderm"} . "\n";
exit;


Last modified: January 6, 2008



biomedical informatics cover Perl Programming for Medicine and Biology Cover Ruby for Medicine and Biology Cover Ruby: The Programming Language