Class | Bio::FlatFile::AutoDetect |
In: |
lib/bio/io/flatfile/autodetection.rb
|
Parent: | Object |
AutoDetect automatically determines the database class of the given data.
TopRule | = | RuleSpecial.new('top') | Special element that is always top priority. | |
BottomRule | = | RuleSpecial.new('bottom') | Special element that is always bottom priority. |
Makes a new autodetect object and adds each of the given elements to it.
# File lib/bio/io/flatfile/autodetection.rb, line 361
# Creates a new autodetect object and registers every given element with
# it, in argument order, by calling #add on the new object.
# Returns the newly created object.
def self.[](*arg)
  arg.each_with_object(new) { |rule, detector| detector.add(rule) }
end
returns the default autodetect object
# File lib/bio/io/flatfile/autodetection.rb, line 348
# Returns the object stored in @default. When nothing truthy is stored
# there yet, the result of make_default is stored first and then returned.
def self.default
  @default ||= make_default
end
sets the default autodetect object.
# File lib/bio/io/flatfile/autodetection.rb, line 356
# Stores +ad+ in @default, replacing whatever default autodetect object
# was kept there before.
def self.default=(ad)
  @default = ad
end
Makes a new autodetect object populated with the default set of detection rules.
# File lib/bio/io/flatfile/autodetection.rb, line 368
# Builds and returns a fresh autodetect object filled with the built-in
# rules.
#
# The method works in three steps:
# 1. It creates the object with self[...], passing one rule per known
#    format. Every rule is also assigned to a local variable so that it
#    can be referred to below.
# 2. It declares relative priorities between rules with is_prior_to.
# 3. It calls rehash on the object and returns it.
#
# NOTE(review): RuleRegexp, RuleRegexp2 and RuleProc are defined outside
# this listing; presumably the leading string arguments name the database
# class(es) a rule can produce -- confirm in the rule classes.
# NOTE(review): runs of spaces inside the patterns below may have been
# collapsed to single spaces in this listing -- confirm every pattern
# against lib/bio/io/flatfile/autodetection.rb before relying on it.
def self.make_default
  a = self[
    genbank = RuleRegexp[ 'Bio::GenBank',
      /^LOCUS .+ bp .*[a-z]*[DR]?NA/ ],
    genpept = RuleRegexp[ 'Bio::GenPept',
      /^LOCUS .+ aa .+/ ],
    medline = RuleRegexp[ 'Bio::MEDLINE',
      /^PMID\- [0-9]+$/ ],
    embl = RuleRegexp[ 'Bio::EMBL',
      /^ID .+\; .*(DNA|RNA|XXX)\;/ ],
    sptr = RuleRegexp2[ 'Bio::SPTR',
      /^ID .+\; *PRT\;/,
      /^ID [-A-Za-z0-9_\.]+ .+\; *[0-9]+ *AA\./ ],
    prosite = RuleRegexp[ 'Bio::PROSITE',
      /^ID [-A-Za-z0-9_\.]+\; (PATTERN|RULE|MATRIX)\.$/ ],
    transfac = RuleRegexp[ 'Bio::TRANSFAC',
      /^AC [-A-Za-z0-9_\.]+$/ ],

    # The block evaluates to Bio::AAindex2 or Bio::AAindex1 when both the
    # "H" line and the corresponding marker line are present, to false
    # when only the "H" line matches, and to nil when it does not match.
    aaindex = RuleProc.new('Bio::AAindex1', 'Bio::AAindex2') do |text|
      if /^H [-A-Z0-9_\.]+$/ =~ text then
        if text =~ /^M [rc]/ then
          Bio::AAindex2
        elsif text =~ /^I A\/L/ then
          Bio::AAindex1
        else
          false #fail to determine
        end
      else
        nil
      end
    end,

    litdb = RuleRegexp[ 'Bio::LITDB',
      /^CODE [0-9]+$/ ],
    pathway_module = RuleRegexp[ 'Bio::KEGG::MODULE',
      /^ENTRY .+ Pathway\s+Module\s*/ ],
    pathway = RuleRegexp[ 'Bio::KEGG::PATHWAY',
      /^ENTRY .+ Pathway\s*/ ],
    brite = RuleRegexp[ 'Bio::KEGG::BRITE',
      /^Entry [A-Z0-9]+/ ],
    orthology = RuleRegexp[ 'Bio::KEGG::ORTHOLOGY',
      /^ENTRY .+ KO\s*/ ],
    drug = RuleRegexp[ 'Bio::KEGG::DRUG',
      /^ENTRY .+ Drug\s*/ ],
    glycan = RuleRegexp[ 'Bio::KEGG::GLYCAN',
      /^ENTRY .+ Glycan\s*/ ],
    enzyme = RuleRegexp2[ 'Bio::KEGG::ENZYME',
      /^ENTRY EC [0-9\.]+$/,
      /^ENTRY .+ Enzyme\s*/
    ],
    compound = RuleRegexp2[ 'Bio::KEGG::COMPOUND',
      /^ENTRY C[A-Za-z0-9\._]+$/,
      /^ENTRY .+ Compound\s*/
    ],
    reaction = RuleRegexp2[ 'Bio::KEGG::REACTION',
      /^ENTRY R[A-Za-z0-9\._]+$/,
      /^ENTRY .+ Reaction\s*/
    ],
    genes = RuleRegexp[ 'Bio::KEGG::GENES',
      /^ENTRY .+ (CDS|gene|.*RNA|Contig) / ],
    genome = RuleRegexp[ 'Bio::KEGG::GENOME',
      /^ENTRY [a-z]+$/ ],

    # $1 is the word captured from the DOCTYPE declaration ("sequences"
    # or "clusters"); it selects which MaXML class the block evaluates to.
    fantom = RuleProc.new('Bio::FANTOM::MaXML::Cluster',
                          'Bio::FANTOM::MaXML::Sequence') do |text|
      if /\<\!DOCTYPE\s+maxml\-(sequences|clusters)\s+SYSTEM/ =~ text
        case $1
        when 'clusters'
          Bio::FANTOM::MaXML::Cluster
        when 'sequences'
          Bio::FANTOM::MaXML::Sequence
        else
          nil #unknown
        end
      else
        nil
      end
    end,

    pdb = RuleRegexp[ 'Bio::PDB',
      /^HEADER .{40}\d\d\-[A-Z]{3}\-\d\d [0-9A-Z]{4}/ ],
    het = RuleRegexp[ 'Bio::PDB::ChemicalComponent',
      /^RESIDUE +.+ +\d+\s*$/ ],

    clustal = RuleRegexp2[ 'Bio::ClustalW::Report',
      /^CLUSTAL .*\(.*\).*sequence +alignment/,
      /^CLUSTAL FORMAT for T-COFFEE/ ],

    gcg_msf = RuleRegexp[ 'Bio::GCG::Msf',
      /^!!(N|A)A_MULTIPLE_ALIGNMENT .+/ ],

    gcg_seq = RuleRegexp[ 'Bio::GCG::Seq',
      /^!!(N|A)A_SEQUENCE .+/ ],

    blastxml = RuleRegexp[ 'Bio::Blast::Report',
      /\<\!DOCTYPE BlastOutput PUBLIC / ],
    wublast = RuleRegexp[ 'Bio::Blast::WU::Report',
      /^BLAST.? +[\-\.\w]+\-WashU +\[[\-\.\w ]+\]/ ],
    wutblast = RuleRegexp[ 'Bio::Blast::WU::Report_TBlast',
      /^TBLAST.? +[\-\.\w]+\-WashU +\[[\-\.\w ]+\]/ ],
    blast = RuleRegexp[ 'Bio::Blast::Default::Report',
      /^BLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ ],
    tblast = RuleRegexp[ 'Bio::Blast::Default::Report_TBlast',
      /^TBLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ ],
    rpsblast = RuleRegexp[ 'Bio::Blast::RPSBlast::Report',
      /^RPS\-BLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ ],

    blat = RuleRegexp[ 'Bio::Blat::Report',
      /^psLayout version \d+/ ],
    spidey = RuleRegexp[ 'Bio::Spidey::Report',
      /^\-\-SPIDEY version .+\-\-$/ ],
    hmmer = RuleRegexp[ 'Bio::HMMER::Report',
      /^HMMER +\d+\./ ],
    sim4 = RuleRegexp[ 'Bio::Sim4::Report',
      /^seq1 \= .*\, \d+ bp(\r|\r?\n)seq2 \= .*\, \d+ bp(\r|\r?\n)/ ],

    fastq = RuleRegexp[ 'Bio::Fastq',
      /^\@.+(?:\r|\r?\n)(?:[^\@\+].*(?:\r|\r?\n))+\+.*(?:\r|\r?\n).+(?:\r|\r?\n)/ ],

    # The block evaluates to one of the three listed classes when a ">"
    # line is present and one of the patterns matches, to false when a
    # ">" line is present but no pattern matches, and to nil when there
    # is no ">" line at all.
    fastaformat = RuleProc.new('Bio::FastaFormat',
                               'Bio::NBRF',
                               'Bio::FastaNumericFormat') do |text|
      if /^>.+$/ =~ text
        case text
        when /^>([PF]1|[DR][LC]|N[13]|XX)\;.+/
          Bio::NBRF
        when /^>.+$\s+(^\#.*$\s*)*^\s*\d*\s*[-a-zA-Z_\.\[\]\(\)\*\+\$]+/
          Bio::FastaFormat
        when /^>.+$\s+^\s*\d+(\s+\d+)*\s*$/
          Bio::FastaNumericFormat
        else
          false
        end
      else
        nil
      end
    end
  ]

  # dependencies
  # NOTE(review): presumably "x.is_prior_to y" makes rule x get tried
  # before rule y -- confirm in the rule classes.
  # NCBI
  genbank.is_prior_to genpept
  # EMBL/UniProt
  embl.is_prior_to sptr
  sptr.is_prior_to prosite
  prosite.is_prior_to transfac
  # KEGG
  #aaindex.is_prior_to litdb
  #litdb.is_prior_to brite
  pathway_module.is_prior_to pathway
  pathway.is_prior_to brite
  brite.is_prior_to orthology
  orthology.is_prior_to drug
  drug.is_prior_to glycan
  glycan.is_prior_to enzyme
  enzyme.is_prior_to compound
  compound.is_prior_to reaction
  reaction.is_prior_to genes
  genes.is_prior_to genome
  # PDB
  pdb.is_prior_to het
  # BLAST
  wublast.is_prior_to wutblast
  wutblast.is_prior_to blast
  blast.is_prior_to tblast
  # Fastq
  BottomRule.is_prior_to(fastq)
  fastq.is_prior_to(fastaformat)
  # FastaFormat
  BottomRule.is_prior_to(fastaformat)

  # for debug
  #debug_first = RuleDebug.new('debug_first')
  #a.add(debug_first)
  #debug_first.is_prior_to(TopRule)

  ## for debug
  #debug_last = RuleDebug.new('debug_last')
  #a.add(debug_last)
  #BottomRule.is_prior_to(debug_last)
  #fastaformat.is_prior_to(debug_last)

  # Rebuild the object after the priority declarations above, then
  # hand it back to the caller.
  a.rehash
  return a
end
Autodetect from the text. Returns a database class if succeeded. Returns nil if failed.
# File lib/bio/io/flatfile/autodetection.rb, line 305
# Autodetects the database class from +text+.
#
# Each element of +elements+ is asked in order, through
# guess(text, meta), and the first truthy answer is returned.
#
# +text+ :: the data to examine
# +meta+ :: extra information handed unchanged to every guess call
#           (default: an empty Hash)
#
# Returns the first truthy guess, or nil when no element produces one.
# A falsy answer from the last element (for example +false+) is never
# passed through, so a failure is always reported as nil.
def autodetect(text, meta = {})
  elements.each do |rule|
    #$stderr.puts rule.name
    guessed = rule.guess(text, meta)
    return guessed if guessed
  end
  nil
end
autodetect from the FlatFile object. Returns a database class if succeeded. Returns nil if failed.
# File lib/bio/io/flatfile/autodetection.rb, line 318
# Autodetects the database class from a FlatFile object.
#
# +ff+    :: object whose @stream instance variable provides
#            prefetch_gets and prefetch_buffer, and optionally path
# +lines+ :: maximum number of lines to prefetch from the stream
#            (default 31)
#
# Returns the first truthy result of #autodetect, or nil when nothing
# is detected within +lines+ lines or before the stream runs out.
def autodetect_flatfile(ff, lines = 31)
  meta = {}
  stream = ff.instance_eval { @stream }
  # Ask the stream whether it has a path instead of rescuing NameError,
  # so that a genuine error raised inside #path is not silently hidden.
  path = stream.path if stream.respond_to?(:path)
  if path
    meta[:path] = path
    # call autodetect once with meta and without any read action
    detected = autodetect(stream.prefetch_buffer, meta)
    return detected if detected
  end
  # reading stream
  1.upto(lines) do
    line = stream.prefetch_gets
    break unless line
    # a blank line adds no new information, so do not re-run detection
    next if line.strip.empty?
    detected = autodetect(stream.prefetch_buffer, meta)
    return detected if detected
  end
  nil
end
Iterates over each element.
# File lib/bio/io/flatfile/autodetection.rb, line 298
# Hands the given block on to elements.each, so every element is yielded
# to it in turn. Returns whatever elements.each returns.
def each_rule(&block) #:yields: elem
  elements.each(&block)
end
visualizes the object (mainly for debug)
# File lib/bio/io/flatfile/autodetection.rb, line 291
# Builds a debugging string of the form "<ClassName name1 name2 ...>",
# where each name is the inspect form of an element's name and the names
# are separated by single spaces.
def inspect
  names = elements.map { |rule| rule.name.inspect }
  "<#{self.class} #{names.join(' ')}>"
end
rebuilds the object and clears internal cache.
# File lib/bio/io/flatfile/autodetection.rb, line 285
# Calls rehash on @rules and then drops the cached @elements by setting
# it to nil. Returns nil.
def rehash
  @rules.rehash
  @elements = nil
end
(required by TSort.) For a given element, yields each child (= lower priority elements) of the element.
# File lib/bio/io/flatfile/autodetection.rb, line 253
# (required by TSort.) Yields each child (= lower priority element) of
# +elem+.
#
# * For TopRule: every value of @rules except TopRule itself and except
#   elements that list TopRule among their lower_priority_elements.
# * For BottomRule: every value of @rules that lists BottomRule among its
#   higher_priority_elements.
# * For any other element: its lower_priority_elements except BottomRule,
#   followed by BottomRule itself unless the element lists BottomRule
#   among its higher_priority_elements.
def tsort_each_child(elem)
  if elem == TopRule
    @rules.each_value do |rule|
      next if rule == TopRule
      next if rule.lower_priority_elements.index(TopRule)
      yield rule
    end
  elsif elem == BottomRule
    @rules.each_value do |rule|
      yield rule if rule.higher_priority_elements.index(BottomRule)
    end
  else
    elem.lower_priority_elements.each do |child|
      yield child if child != BottomRule
    end
    yield BottomRule unless elem.higher_priority_elements.index(BottomRule)
  end
end