Class Bio::SOFT
In: lib/bio/db/soft.rb
Parent: Object

bio/db/soft.rb - Interface for SOFT formatted files

Author:Trevor Wennblom <trevor@corevx.com>
Copyright:Copyright (c) 2007 Midwinter Laboratories, LLC (midwinterlabs.com)
License:The Ruby License

Description

"SOFT (Simple Omnibus in Text Format) is a compact, simple, line-based, ASCII text format that incorporates experimental data and metadata." — GEO, National Center for Biotechnology Information

The Bio::SOFT class reads SOFT Series or Platform formatted files that contain information describing one database, one series, one platform, and many samples (GEO accessions). The data from the file can then be viewed with Ruby methods.

Bio::SOFT also supports the reading of SOFT DataSet files which contain one database, one dataset, and many subsets.

Format specification is located here: http://www.ncbi.nlm.nih.gov/projects/geo/info/soft2.html#SOFTformat

SOFT data files may be directly downloaded here: ftp://ftp.ncbi.nih.gov/pub/geo/DATA/SOFT

NCBI's Gene Expression Omnibus (GEO) is here: http://www.ncbi.nlm.nih.gov/geo

Usage

If an attribute has more than one value then the values are stored in an Array of String objects. Otherwise the attribute is stored as a String.

The platform and each sample may contain a table of data. A dataset from a DataSet file may also contain a table.

Attributes are dynamically created based on the data in the file. Predefined keys have not been created in advance due to the variability of SOFT files in-the-wild.

Keys are generally stored as Symbols. Keys for samples and table headings may alternatively be accessed with Strings. The names of samples (GEO accessions) are case sensitive. Table headers are case insensitive.

  require 'bio'

  lines = IO.readlines('GSE3457_family.soft')
  soft = Bio::SOFT.new(lines)

  soft.platform[:geo_accession]             # => "GPL2092"
  soft.platform[:organism]                  # => "Populus"
  soft.platform[:contributor]               # => ["Jingyi,,Li", "Olga,,Shevchenko", "Steve,H,Strauss", "Amy,M,Brunner"]
  soft.platform[:data_row_count]            # => "240"
  soft.platform.keys.sort {|a,b| a.to_s <=> b.to_s}[0..2] # => [:contact_address, :contact_city, :contact_country]
  soft.platform[:"contact_zip/postal_code"] # => "97331"
  soft.platform[:table].header              # => ["ID", "GB_ACC", "SPOT_ID", "Function/Family", "ORGANISM", "SEQUENCE"]
  soft.platform[:table].header_description  # => {"ORGANISM"=>"sequence sources", "SEQUENCE"=>"oligo sequence used", "Function/Family"=>"gene functions and family", "ID"=>"", "SPOT_ID"=>"", "GB_ACC"=>"Gene bank accession number"}
  soft.platform[:table].rows.size           # => 240
  soft.platform[:table].rows[5]             # => ["A039P68U", "AI163321", "", "TF, flowering protein CONSTANS", "P. tremula x P. tremuloides", "AGAAAATTCGATATACTGTCCGTAAAGAGGTAGCACTTAGAATGCAACGGAATAAAGGGCAGTTCACCTC"]
  soft.platform[:table].rows[5][4]          # => "P. tremula x P. tremuloides"
  soft.platform[:table].rows[5][:organism]  # => "P. tremula x P. tremuloides"
  soft.platform[:table].rows[5]['ORGANISM'] # => "P. tremula x P. tremuloides"

  soft.series[:geo_accession]               # => "GSE3457"
  soft.series[:contributor]                 # => ["Jingyi,,Li", "Olga,,Shevchenko", "Ove,,Nilsson", "Steve,H,Strauss", "Amy,M,Brunner"]
  soft.series[:platform_id]                 # => "GPL2092"
  soft.series[:sample_id].size              # => 74
  soft.series[:sample_id][0..4]             # => ["GSM77557", "GSM77558", "GSM77559", "GSM77560", "GSM77561"]

  soft.database[:name]                      # => "Gene Expression Omnibus (GEO)"
  soft.database[:ref]                       # => "Nucleic Acids Res. 2005 Jan 1;33 Database Issue:D562-6"
  soft.database[:institute]                 # => "NCBI NLM NIH"

  soft.samples.size                         # => 74
  soft.samples[:GSM77600][:series_id]       # => "GSE3457"
  soft.samples['GSM77600'][:series_id]      # => "GSE3457"
  soft.samples[:GSM77600][:platform_id]     # => "GPL2092"
  soft.samples[:GSM77600][:type]            # => "RNA"
  soft.samples[:GSM77600][:title]           # => "jst2b2"
  soft.samples[:GSM77600][:table].header    # => ["ID_REF", "VALUE"]
  soft.samples[:GSM77600][:table].header_description # => {"ID_REF"=>"", "VALUE"=>"normalized signal intensities"}
  soft.samples[:GSM77600][:table].rows.size # => 217
  soft.samples[:GSM77600][:table].rows[5]   # => ["A039P68U", "8.19"]
  soft.samples[:GSM77600][:table].rows[5][0]        # => "A039P68U"
  soft.samples[:GSM77600][:table].rows[5][:id_ref]  # => "A039P68U"
  soft.samples[:GSM77600][:table].rows[5]['ID_REF'] # => "A039P68U"

  lines = IO.readlines('GDS100.soft')
  soft = Bio::SOFT.new(lines)

  soft.database[:name]                      # => "Gene Expression Omnibus (GEO)"
  soft.database[:ref]                       # => "Nucleic Acids Res. 2005 Jan 1;33 Database Issue:D562-6"
  soft.database[:institute]                 # => "NCBI NLM NIH"

  soft.subsets.size                         # => 8
  soft.subsets.keys                         # => ["GDS100_1", "GDS100_2", "GDS100_3", "GDS100_4", "GDS100_5", "GDS100_6", "GDS100_7", "GDS100_8"]
  soft.subsets[:GDS100_7]                   # => {:dataset_id=>"GDS100", :type=>"time", :sample_id=>"GSM548,GSM543", :description=>"60 minute"}
  soft.subsets['GDS100_7'][:sample_id]      # => "GSM548,GSM543"
  soft.subsets[:GDS100_7][:sample_id]       # => "GSM548,GSM543"
  soft.subsets[:GDS100_7][:dataset_id]      # => "GDS100"

  soft.dataset[:order]                      # => "none"
  soft.dataset[:sample_organism]            # => "Escherichia coli"
  soft.dataset[:table].header               # => ["ID_REF", "IDENTIFIER", "GSM549", "GSM542", "GSM543", "GSM547", "GSM544", "GSM545", "GSM546", "GSM548"]
  soft.dataset[:table].rows.size            # => 5764
  soft.dataset[:table].rows[5]              # => ["6", "EMPTY", "0.097", "0.217", "0.242", "0.067", "0.104", "0.162", "0.104", "0.154"]
  soft.dataset[:table].rows[5][4]           # => "0.242"
  soft.dataset[:table].rows[5][:gsm549]     # => "0.097"
  soft.dataset[:table].rows[5][:GSM549]     # => "0.097"
  soft.dataset[:table].rows[5]['GSM549']    # => "0.097"

Methods

Constants

LINE_TYPE_ENTITY_INDICATOR = '^'
LINE_TYPE_ENTITY_ATTRIBUTE = '!'
LINE_TYPE_TABLE_HEADER = '#'
TABLE_COLUMN_DELIMITER = "\t"   # a data table row is identified by the absence of a line-type character

Attributes

database  [RW] 
dataset  [RW] 
platform  [RW] 
samples  [RW] 
series  [RW] 
subsets  [RW] 

Public Class methods

Constructor


Arguments

  • lines: (required) contents of SOFT formatted file
Returns:Bio::SOFT

[Source]

     # File lib/bio/db/soft.rb, line 147
147:   def initialize(lines=nil)
148:     @database = Database.new
149:     
150:     @series = Series.new
151:     @platform = Platform.new
152:     @samples = Samples.new
153:     
154:     @dataset = Dataset.new
155:     @subsets = Subsets.new
156:     
157:     process(lines)
158:   end

Protected Instance methods

[Source]

     # File lib/bio/db/soft.rb, line 381
381:   def custom_raise( line_number_with_0_based_indexing, msg )
382:     raise ["Error processing input line: #{line_number_with_0_based_indexing+1}",
383:       msg].join("\t")
384:   end

[Source]

     # File lib/bio/db/soft.rb, line 354
354:   def error_msg( i, extra_info=nil )
355:     case i
356:     when 10
357:       x = ["Lines without line-type characters are rows in a table, but",
358:       "a line containing an entity indicator such as",
359:       "\"#{LINE_TYPE_ENTITY_INDICATOR}SAMPLE\",",
360:       "\"#{LINE_TYPE_ENTITY_INDICATOR}PLATFORM\",",
361:       "or \"#{LINE_TYPE_ENTITY_INDICATOR}DATASET\" has not been",
362:       "previously encountered or it does not appear that this line is",
363:       "in a table."]
364:     when 20
365:       # tables are allowed inside samples and platforms
366:       x = ["Tables are only allowed inside SAMPLE and PLATFORM.",
367:         "Current table information found inside #{extra_info}."]
368:     when 30
369:       x = ["Entity attribute line (\"#{LINE_TYPE_ENTITY_ATTRIBUTE}\")",
370:         "found before entity indicator line (\"#{LINE_TYPE_ENTITY_INDICATOR}\")"]
371:     when 40
372:       x = ["Unkown entity indicator.  Must be DATABASE, SAMPLE, PLATFORM,",
373:         "SERIES, DATASET, or SUBSET."]
374:     else
375:       raise IndexError, "Unknown error message requested."
376:     end
377:     
378:     x.join(" ")
379:   end

[Source]

     # File lib/bio/db/soft.rb, line 272
272:   def process(lines)
273:     current_indicator = nil
274:     current_class_accessor = nil
275:     in_table = false
276:         
277:     lines.each_with_index do |line, line_number|
278:       line.strip!
279:       next if line.nil? or line.empty?
280:       case line[0].chr
281:       when LINE_TYPE_ENTITY_INDICATOR
282:         current_indicator, value = split_label_value_in( line[1..-1] )
283: 
284:         case current_indicator
285:         when 'DATABASE'
286:           current_class_accessor = @database
287:         when 'DATASET'
288:           current_class_accessor = @dataset
289:         when 'PLATFORM'
290:           current_class_accessor = @platform
291:         when 'SERIES'
292:           current_class_accessor = @series
293:         when 'SAMPLE'
294:           @samples[value] = Sample.new
295:           current_class_accessor = @samples[value]
296:         when 'SUBSET'
297:           @subsets[value] = Subset.new
298:           current_class_accessor = @subsets[value]
299:         else
300:           custom_raise( line_number, error_msg(40, line) )
301:         end
302:           
303:       when LINE_TYPE_ENTITY_ATTRIBUTE
304:         if( current_indicator == nil )
305:           custom_raise( line_number, error_msg(30) )
306:         end
307:         
308:         # Handle lines such as '!platform_table_begin' and '!platform_table_end'
309:         if in_table
310:           if line =~ %r{table_begin}
311:             next
312:           elsif line =~ %r{table_end}
313:             in_table = false
314:             next
315:           end
316:         end
317:         
318:         key, value = split_label_value_in( line, true )
319:         key_s = key.to_sym
320:         
321:         if current_class_accessor.include?( key_s )
322:           if current_class_accessor[ key_s ].class != Array
323:             current_class_accessor[ key_s ] = [ current_class_accessor[ key_s ] ]
324:           end
325:           current_class_accessor[key.to_sym] << value
326:         else
327:           current_class_accessor[key.to_sym] = value
328:         end
329:         
330:       when LINE_TYPE_TABLE_HEADER
331:         if( (current_indicator != 'SAMPLE') and (current_indicator != 'PLATFORM') and (current_indicator != 'DATASET') )
332:           custom_raise( line_number, error_msg(20, current_indicator.inspect) )
333:         end
334:         
335:         in_table = true   # may be redundant, computationally not worth checking
336: 
337:         # We only expect one table per platform or sample
338:         current_class_accessor[:table] ||= Table.new
339:         key, value = split_label_value_in( line )
340:         # key[1..-1] -- Remove first character which is the LINE_TYPE_TABLE_HEADER
341:         current_class_accessor[:table].header_description[ key[1..-1] ] = value
342:         
343:       else
344:         # Type: No line type - should be a row in a table.
345:         
346:         if( (current_indicator == nil) or (in_table == false) )
347:           custom_raise( line_number, error_msg(10) )
348:         end
349:         current_class_accessor[:table].add_header_or_row( line )
350:       end
351:     end
352:   end

[Source]

     # File lib/bio/db/soft.rb, line 386
386:   def split_label_value_in( line, shift_key=false )
387:     line =~ %r{\s*=\s*}
388:     key, value = $`, $'
389:     
390:     if shift_key
391:       key =~ %r{_}
392:       key = $'
393:     end
394:     
395:     if( (key == nil) or (value == nil) )
396:       puts line.inspect
397:       raise
398:     end
399:     
400:     [key, value]
401:   end

[Validate]