#!/usr/bin/perl #validtma.pl use XML::Parser; $opening = <<"EOF"; This script was created by Jules J. Berman on Jan 15, 2002. The purpose of this script is to validate Tissue Microarray (TMA) files that were created using the Common Data Elements (CDEs) and sematics rules developed by the Association for Pathology Informatics TMA Data Exchange Specification This script requires the following two external Perl modules: XML::Parser and MD5, both of which can be downloaded from CPAN (Comprehensive Perl Archive Network) at www.cpan.org The specification and all necessary URIs are listed and described in a recently submitted manuscript. All publications that refer to or derive from the TMA Data Exchange Specification should cite the following: Berman JJ, Edgerton ME, Friedman B. The Tissue Microarray Data Exchange Specification: A Community-based, Open Source Tool for Sharing Tissue Microarray Data. In preparation, BMC Medical Informatics and Decision Making, 2003. This script may be used freely. Bugs should be reported to Jules Berman at bermanj\@mail.nih.gov The software is provided "as is", without warranty of any kind, express or implied, including but not limited to the warranties of merchantability, fitness for a particular purpose and noninfringement. in no event shall the authors or copyright holders be liable for any claim, damages or other liability, whether in an action of contract, tort or otherwise, arising from, out of or in connection with the software or the use or other dealings in the software. Enter a file name you would like to validate. The file must be in the same directory as the validator script, validtma.pl and must end with the extension .xml or .XML The first part of the filename, prior to the extension may not contain spaces and must be 8 characters or less in length Enter your filename now EOF print $opening; my %index = &build_cde; my $line = " "; my $parser = XML::Parser->new( Handlers => { Init => \&handle_doc_start, Final => \&handle_doc_end, Start => \&handle_elem_start, End => \&handle_elem_end, Char => \&handle_char_data, }); my %filetags; $file = ; $file =~ s/\n//; if ($file eq "") { die "You didn't enter a filename. Try again later.\n"; } $parser -> parsefile($file); sub handle_doc_start { print "\nBegining to parse $file now\n\n"; } sub handle_doc_end { exists($filetags{"header"})||die "No header tag. TMA file is invalid.\n"; exists($filetags{"tma"})||die "No tma tag. TMA file is invalid.\n"; exists($filetags{"histo"})||die "No histo tag. TMA file is invalid.\n"; exists($filetags{"slide"})||die "No slide tag. TMA file is invalid.\n"; exists($filetags{"core"})||die "No core tag. TMA file is invalid.\n"; exists($filetags{"block"})||die "No block tag. TMA file is invalid.\n"; print "\nFinished. $file is a valid Tissue Microarray File.\n" } my @build; sub handle_elem_start { my $count = 0; my $previous; my ($expat, $name, %atts) = @_; $name =~ s/^[0-9a-z\-\_]*\://i; if (exists $index{$name}) { $filetags{$name}++; print "$name\n"; if ($name eq "histo") { if (scalar(@build) > 0) { die "Your first element must be the root element, histo\n"; } undef @build; unshift (@build, $name) } if ($name eq "tma") { $previous = shift(@build); if ($previous ne "histo") { die "You have a tma element not preceded by a histo element\n"; } else { unshift (@build, $name) } } if ($name eq "header") { $previous = shift(@build); if ($previous ne "tma") { die "You have a header element not preceded by a tma element\n"; } else { unshift (@build, $name); } } if ($name eq "block") { $previous = shift(@build); if ($previous ne "header") { die "You have a block element not preceded by a header element\n"; } else { unshift (@build, $name); } } if ($name eq "slide") { $previous = shift(@build); unshift(@build, $previous); #lets's put this back in the pile if (($previous eq "block") || ($previous eq "core")|| ($previous eq "slide")) { unshift (@build, $name); } else { die "You have a slide element not preceded by a block or core or slide element\n"; } } if ($name eq "core") { $previous = shift(@build); unshift(@build, $previous); if (($previous eq "block") || ($previous eq "slide")|| ($previous eq "core")) { unshift (@build, $name); } else { die "You have a core element not preceded by a block or slide or core element\n"; } } if ($name =~ /\_/) { my @name_array = split(/\_/, $name); my $last = pop(@name_array); my $next_to_last = pop(@name_array); $previous = shift (@build); if ($last eq $previous) { unshift (@build,$previous); #The element has been repeated return; #That's ok. Just put it back in the } #build array and return if ($previous eq $next_to_last) { unshift (@build, $previous); #the element has a new hierarchical unshift (@build, $last); #extension. add the previous last return; #and the new last to build array } #and return $previous = shift (@build); #let's look back one more in build array if ($last eq $previous) { unshift (@build,$previous); #The element has stepped back return; #one in the hierarchy. That's ok. Just put it back in the } #build array and return if ($previous eq $next_to_last) { unshift (@build, $previous); #the element has backed up one and unshift (@build, $last); #added a new extensions. Add the return; #previous and the last to the build array } #and return die "The element $name does not fit in the hierarchy\n"; } } } sub handle_elem_end { my($expat, $name) = @_; $name =~ s/^[0-9a-z\-\_]*\://i; if (exists $index{$name}) { if ($name eq "tma") { undef @build; unshift(@build, $name) } if ($name =~ /\_/) { my @name_array = split(/\_/, $name); my $last = pop(@name_array); $previous = shift (@build); unless ($last eq $previous) { unshift (@build, $previous) } } } } sub handle_char_data { my $expat = $_[0]; $text .= "$_[1]"; } use MD5; open (TEXT,"$file"); $context = new MD5; $context->addfile(TEXT); $digest = $context->digest(); $readable_digest = unpack ("H*", $digest); print "\nThe one-way hash of your file is $readable_digest"; sub build_cde { my %build_index = qw( histo 1 header 1 Title 1 Creator 1 SubjectandKeywords 1 Description 1 Publisher 1 Contributor 1 Date 1 ResourceType 1 Format 1 ResourceIdentifier 1 Source 1 Language 1 Relation 1 Coverage 1 RightsManagement 1 filename 1 tma 1 block 1 block_purpose 1 block_identifier 1 block_description 1 block_meshnumbers 1 block_related_blocks 1 block_creator 1 block_creation-date 1 block_protocol 1 block_number-of-cores 1 block_core-size 1 block_core-spacing 1 block_qa 1 block_status 1 block_array-hash 1 slide 1 slide_identifier 1 slide_cut-date 1 slide_storage 1 slide_section-thickness 1 slide_level 1 slide_sectioning-protocol 1 slide_test-date 1 slide_test-category 1 slide_test-protocol 1 slide_stain 1 slide_test-control 1 core 1 core_array-id 1 core_tissue-fixative 1 core_organism 1 core_demographics-record 1 core_anatomic-site 1 core_organ 1 core_tissue-origin 1 core_data-descriptor 1 core_data-descriptor_QC 1 core_clinical-info 1 core_clinical-info_id 1 core_clinical-info_availability 1 core_histo-repository 1 core_histo-repository_specimen-id 1 core_histo-repository_specimen-diagnosis 1 core_histo-repository_donor-block 1 core_histo-repository_donor-block_drill-site 1 core_histo-repository_donor-block_drill-site_diagnosis 1 core_results 1 core_results_slide-id 1 core_results_method-of-reading-data 1 core_results_number-of-nuclei-counted 1 core_results_tissue-intensity 1 core_results_percent-tissue-staining 1 core_results_compartment-staining 1 core_results_staining-pattern 1 core_image 1 core_image_slide-id 1 core_image_identifier 1 core_image_format 1 core_image_checksum 1 core_image_magnification 1 core_image_pixel_size 1); return %build_index; } exit;