#!/usr/bin/perl
#omim2.pl
#omimpre.pl
#Extracting genetic conditions that predispose to cancer, using the
#Online Mendelian Inheritance in Man
#Jules J. Berman, June 24, 2003
#Submitted to APIII, October 7-10, Pittsburgh, PA.
#
#BACKGROUND: A complete terminology of lesions/conditions related
#to cancer would contain: 1) a comprehensive nomenclature of tumors;
#2) a comprehensive nomenclature of precancers (morphologically
#identifiable lesions that precede the development of cancer);
#3) a comprehensive nomenclature of acquired conditions that
#increase the risk of cancer (e.g. AIDS, and radiation exposure);
#and 4) a comprehensive nomenclature of genetic conditions that
#predispose to cancer (such as Li-Fraumeni syndrome and Xeroderma
#Pigmentosum). A complete cancer terminology is currently unavailable
#to researchers and pathologists. The purpose of such a nomenclature
#would be to facilitate the integration of biomedical data with
#lesions of interest to cancer researchers. Data integration enables
#researchers to discover the medical relevance of heterogeneous data
#elements. The author has published informatics techniques used to
#compile nomenclatures 1 and 2. This abstract describes a way of
#compiling nomenclature 4, using the Online Mendelian Inheritance in
#Man (OMIM).
#
#TECHNOLOGY: OMIM is a publicly available comprehensive and curated
#collection of all inherited conditions in man. It can be downloaded
#through anonymous ftp at: ftp.ncbi.nih.gov /repository/OMIM. The
#June 23, 2003 OMIM file was used. This file is 87,722,918 bytes in
#length and contains descriptions of 15,113 different inherited
#conditions of man. Conditions that are associated with the development
#of tumors are provided with a listing of the tumors that have been
#reported.
#
#DESIGN: The Perl script (omimpre.pl) collects OMIM conditions
#predisposed to neoplastic development. It extracts the following
#information from OMIM records: 1) the OMIM number of the condition;
#2) the name of the condition and its synonymous or closely related
#terms; and 3) the names of tumors associated with the condition.
#The script requires an external file (look-up list) containing a
#comprehensive listing of neoplastic terms. Instructions for obtaining
#such a file can be obtained from http://65.222.228.150/jjb/ca_terms.txt.
#The extracted information is collected into an XML file. A version of
#the raw XML output file can be downloaded from
#http://65.222.228.150/jjb/omimpre.xml.
#
#RESULTS: The Perl script produces an output file in about 10 seconds
#using a 1.6 GHz computer. The output contains 518 conditions. Lynch
#cancer family syndrome, hereditary nonpolyposis colorectal cancer,
#cheilitis glandularis, Pasini typ epidermolysis bullosa dystrophica,
#hereditary desmoid disease, Aase-Smith syndrome, familial type thyroid
#carcinoma, Michelin tire baby syndrome, Oslam syndrome, and Maffucci
#syndrome are a small sampling of extracted conditions.
#
#CONCLUSION: A Perl script is entered into the public domain that
#extracts from OMIM inherited conditions that predispose man to cancer.
#The Perl script is available at: http://65.222.228.150/jjb/omimpre.txt
#The output file is XML, supporting the facile integration of data
#elements (such as the OMIM identifier and the names of tumors) with
#other biological databases. The output file can be easily updated
#with newer versions of OMIM.

use strict;
use warnings;

# Input and output files, all resolved relative to the current directory.
my $TERM_FILE   = 'neoexp.txt';     # look-up list: "<code> <neoplasm term>" per line
my $OMIM_FILE   = 'omim';           # OMIM flat file, records separated by *RECORD*
my $OUTPUT_FILE = 'omimpre.xml';

# NOTE(review): the XML prologue and the root element name were lost from
# the copy of the script this was restored from (only the per-record tags
# survive).  The declaration and the name below are placeholders chosen to
# yield a well-formed document -- confirm against an original omimpre.xml.
my $ROOT_ELEMENT = 'omim_set';

# NOTE(review): the opening words of this notice (everything before the
# first URL) and its original line layout were lost; only the text below
# survives.
my $statement_0 = <<'EOS';
http://65.222.228.150/jjb/presum.xml
Extracting cancer terms from publicly available nomenclatures
http://65.222.228.150/jjb/ca_terms.txt
The software is provided "as is", without warranty of any kind,
express or implied, including but not limited to the warranties
of merchantability, fitness for a particular purpose and
noninfringement. in no event shall the authors or copyright holders
be liable for any claim, damages or other liability, whether in an
action of contract, tort or otherwise, arising from, out of or in
connection with the software or the use or other dealings in the
software.
EOS

# Terms that appear in the look-up list but are too generic to count as a
# neoplasm on their own.  Several are stored without a final "s"
# ("diagnosi", "abnormalitie"); they are compared against the look-up list
# entries exactly as written there.
my %remove = map { $_ => 1 } (
    qw(
        carcinoma men dysgenesi dysmorphism malformation deformity
        malignant an cancer breast deformitie diagnosi deformity
        chemotherapy activity acitivitie tumor all abnormality
        diagnosed abnormalitie
    ),
    'drug therapy',
    'viral infection',
    'cerebral sclerosi',
    'congenital abnormality',
    'congenital abnormalitie',
    'abnormal development',
);

# Neoplasm term => nomenclature code.  Filled by load_term_index() and read
# by parse(); declared here so that both subs can see it.
my %index;

print $statement_0;
my @startval = times();

load_term_index($TERM_FILE);

open my $omim_fh, '<', $OMIM_FILE
    or die "Cannot open $OMIM_FILE: $!";
open my $out_fh, '>', $OUTPUT_FILE
    or die "Cannot open $OUTPUT_FILE for writing: $!";

print {$out_fh} qq{<?xml version="1.0"?>\n<$ROOT_ELEMENT>\n};

my $count = 0;
{
    # Read one whole OMIM record per iteration instead of one line.
    local $/ = "*RECORD*";

    RECORD:
    while (my $record = <$omim_fh>) {

        # A usable record carries its six-digit number between the NO and
        # TI field markers; anything else (e.g. the text ahead of the first
        # record) is skipped.
        next RECORD
            unless $record =~ /\*FIELD\* NO[.\n]*([0-9]{6})[.\n]*\*FIELD\* TI/;
        my $omimnumber = $1;

        my @neoplasms = find_neoplasms($record);
        next RECORD unless @neoplasms;

        my $description = extract_omim_names($record);

        $count++;
        print "$count\n";    # progress indicator on STDOUT

        print {$out_fh} "<omim>\n";
        print {$out_fh} "<entry_number>$count</entry_number>\n";
        print {$out_fh} "<omim_number>$omimnumber</omim_number>\n";
        print {$out_fh} "$description\n";
        print {$out_fh} "<neoplasm>$_</neoplasm>\n" for @neoplasms;
        print {$out_fh} "</omim>\n";
    }
}

print {$out_fh} "</$ROOT_ELEMENT>\n";

close $omim_fh;
# Buffered write errors only surface at close, so check it.
close $out_fh or die "Cannot close $OUTPUT_FILE: $!";

my @stopval   = times();
my $totaltime = $stopval[0] - $startval[0];    # user CPU seconds
print "\nThe total time for script was $totaltime";
print "\nThe output of this script is the file omimpre.xml";

exit;

# Fill %index from the look-up list.
#
# Each usable line has the form "<code> <term>", where the code is an
# optional C or J followed by digits, slashes or hyphens.  Lines of any
# other shape are ignored.  Terms are skipped when they contain three
# consecutive capitals, are listed in %remove, or are shorter than three
# characters.
sub load_term_index {
    my ($path) = @_;

    open my $terms_fh, '<', $path
        or die "Cannot open $path: $!";

    while (my $line = <$terms_fh>) {
        next unless $line =~ /^([CJ]?[0-9\/\-]+) (.+)$/;
        my ($code, $term) = ($1, $2);
        $term =~ s/\s+$//;    # drop a stray CR or trailing blanks

        next if $term =~ /[A-Z]{3}/;
        next if exists $remove{$term};
        next if length($term) < 3;

        $index{$term} = $code;
    }

    close $terms_fh;
    return;
}

# Build the <omim_name> elements for one record from its TI (title) field.
#
# Returns the text that follows "*FIELD* TI" unchanged when the record has
# no "*FIELD* TX" marker; otherwise returns the title block with each
# ";"-terminated name wrapped in <omim_name>...</omim_name>.
sub extract_omim_names {
    my ($record) = @_;

    my ($description) = $record =~ /\*FIELD\* TI(.*)/s;
    $description = '' unless defined $description;

    my $tx_at = index $description, '*FIELD* TX';
    return $description if $tx_at < 0;
    $description = substr $description, 0, $tx_at;

    # Drop the "#" and "*" markers, then keep only the run of title
    # characters that follows the six-digit number.
    $description =~ s/[\#\*]//g;
    $description
        = $description =~ /[0-9]{6}([A-Z\n \;\,\-0-9ut\.]+)/ ? $1 : '';

    # If another field marker is still inside the captured title block,
    # cut the block there.  This has to be done on the cleaned description:
    # taking the text that precedes the marker in the raw record would drag
    # the "*FIELD* NO" header into the names.
    $description =~ s/FIELD MN.*//s;

    $description =~ s/^\n+//;
    $description =~ s/\n*$/\;/;    # make sure the last name ends in ";"
    $description
        =~ s{([\d\w\, \-\.\n]+)\;+}{\n<omim_name>$1</omim_name>}g;
    $description =~ s/\n+/\n/;

    return $description;
}

# Collect the neoplasm terms mentioned in the listed conditions of a record.
#
# Only the text ahead of "*FIELD* CD" is examined (the whole record when
# that marker is absent).  Returns a sorted list of unique "phrase|code"
# strings as produced by parse(); the list is empty when nothing matched.
sub find_neoplasms {
    my ($record) = @_;

    my $cd_at = index $record, '*FIELD* CD';
    my $text  = $cd_at < 0 ? $record : substr($record, 0, $cd_at);

    my %seen;
    for my $raw (split /\n/, $text) {

        # All the listed conditions start on a line that starts with three
        # spaces.
        # NOTE(review): the run of blanks in this pattern was collapsed to
        # a single space in the copy this was restored from; three spaces
        # is taken from the comment above.
        next unless $raw =~ /^   [\w\s]+/;

        my $value = lc $raw;

        # Replace with a space everything that is not a letter, number,
        # period, line-break, apostrophe, space, hyphen or parenthesis.
        $value =~ tr/0-9a-zA-Z.\n' \-\)\(/ /c;
        $value =~ s/^ +//;
        $value =~ s/ +$//;
        $value =~ s/ +/ /g;

        $seen{$_} = 1 for split /\n/, parse($value);
    }

    return sort keys %seen;
}

# Find every look-up-list term inside a normalized line of text.
#
# Takes one string of lower-case words separated by single spaces.  Every
# run of four, three, two and one consecutive words is looked up in %index
# after the final "s" has been taken off each word, because the look-up
# list stores its terms that way.
#
# Returns the hits as "phrase|code\n" lines, sorted and without
# duplicates, concatenated into one string ("" when there are no hits).
# The phrase is reported as it appears in the text, plurals included.
sub parse {
    my ($hopline) = @_;

    my @words = split / /, $hopline;
    my %hits;

    for my $width (reverse 1 .. 4) {
        # Stop where a full $width-word window no longer fits; a shorter
        # window is covered by a later, narrower pass.
        for my $start (0 .. $#words - $width + 1) {
            my $segment = join ' ', @words[$start .. $start + $width - 1];

            my $quicksegment = $segment;
            $quicksegment =~ s/\b([a-z]+)s\b/$1/g;    # takes the s off plurals

            $hits{"$segment|$index{$quicksegment}"} = 1
                if exists $index{$quicksegment};
        }
    }

    return join '', map { "$_\n" } sort keys %hits;
}