Parent

Class/Module Index [+]

Quicksearch

CharDet::UniversalDetector

Attributes

result[RW]

Public Class Methods

new() click to toggle source
# File lib/tmail/vendor/rchardet-1.3/lib/rchardet/universaldetector.rb, line 38
# Creates a detector: compiles the two byte-pattern matchers, starts with no
# probers, and establishes the initial state through #reset.
def initialize
  # Matches any byte with the high bit set. The +n+ flag makes the pattern
  # byte-oriented; without it Ruby 1.9+ rejects this literal with "invalid
  # multibyte escape" when the source file is UTF-8.
  # NOTE(review): on Ruby 1.9+ a byte-oriented pattern can only be matched
  # against binary (or pure-ASCII) strings -- confirm callers feed such data.
  @_highBitDetector = /[\x80-\xFF]/n
  # Matches ESC (octal 033) or the two-character sequence "~{". The earlier
  # spelling "\0033" was read as the byte \003 followed by a literal "3", so
  # an ESC byte was never recognised.
  @_escDetector = /(\033|~\{)/
  @_mEscCharSetProber = nil
  @_mCharSetProbers = []
  reset
end

Public Instance Methods

close() click to toggle source
# File lib/tmail/vendor/rchardet-1.3/lib/rchardet/universaldetector.rb, line 134
# Ends detection and settles on a final verdict from the data fed so far.
#
# Returns the result hash (keys 'encoding' and 'confidence') when a verdict
# is reached in this call. Returns nil when detection had already finished,
# when no data was ever fed, or when no prober's confidence exceeds
# MINIMUM_THRESHOLD; in those cases @result is left untouched.
def close
  return if @done

  unless @_mGotData
    $stderr << "no data received!\n" if $debug
    return
  end
  @done = true

  if @_mInputState == EPureAscii
    @result = {'encoding' => 'ascii', 'confidence' => 1.0}
    return @result
  end

  if @_mInputState == EHighbyte
    # Ask every prober once and rank them on the recorded answers.
    confidences = {}
    @_mCharSetProbers.each { |prober| confidences[prober] = prober.get_confidence }
    best = @_mCharSetProbers.max { |a, b| confidences[a] <=> confidences[b] }
    if best && best.get_confidence > MINIMUM_THRESHOLD
      @result = {'encoding' => best.get_charset_name,
                 'confidence' => best.get_confidence}
      return @result
    end
  end

  if $debug
    $stderr << "no probers hit minimum threshhold\n"
    # The prober list is only populated once a high byte has been seen, so
    # it can be empty here; dump the first group's members only if it exists
    # instead of calling a method on nil.
    first_group = @_mCharSetProbers[0]
    (first_group ? first_group._mProbers : []).each do |prober|
      next unless prober
      $stderr << "#{prober.get_charset_name} confidence = #{prober.get_confidence}\n"
    end
  end
  nil
end
feed(aBuf) click to toggle source
# File lib/tmail/vendor/rchardet-1.3/lib/rchardet/universaldetector.rb, line 61
def feed(aBuf)
  return if @done

  aLen = aBuf.length
  return if not aLen

  if not @_mGotData
    # If the data starts with BOM, we know it is UTF
    if aBuf[0...3] == "\xEF\xBB\xBF"
      # EF BB BF  UTF-8 with BOM
      @result = {'encoding' => "UTF-8", 'confidence' => 1.0}
    elsif aBuf[0...4] == "\xFF\xFE\x00\x00"
      # FF FE 00 00  UTF-32, little-endian BOM
      @result = {'encoding' => "UTF-32LE", 'confidence' => 1.0}
    elsif aBuf[0...4] == "\x00\x00\xFE\xFF"
      # 00 00 FE FF  UTF-32, big-endian BOM
      @result = {'encoding' => "UTF-32BE", 'confidence' => 1.0}
    elsif aBuf[0...4] == "\xFE\xFF\x00\x00"
      # FE FF 00 00  UCS-4, unusual octet order BOM (3412)
      @result = {'encoding' => "X-ISO-10646-UCS-4-3412", 'confidence' => 1.0}
    elsif aBuf[0...4] == "\x00\x00\xFF\xFE"
      # 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
      @result = {'encoding' =>  "X-ISO-10646-UCS-4-2143", 'confidence' =>  1.0}
    elsif aBuf[0...2] == "\xFF\xFE"
      # FF FE  UTF-16, little endian BOM
      @result = {'encoding' =>  "UTF-16LE", 'confidence' =>  1.0}
    elsif aBuf[0...2] == "\xFE\xFF"
      # FE FF  UTF-16, big endian BOM
      @result = {'encoding' =>  "UTF-16BE", 'confidence' =>  1.0}
    end
  end
     
  @_mGotData = true
  if @result['encoding'] and (@result['confidence'] > 0.0)  
    @done = true
    return
  end
  
  if @_mInputState == EPureAscii
    if @_highBitDetector =~ (aBuf)
      @_mInputState = EHighbyte
    elsif (@_mInputState == EPureAscii) and @_escDetector =~ (@_mLastChar + aBuf)
      @_mInputState = EEscAscii
    end
  end
    
  @_mLastChar = aBuf[-1..-1]
  if @_mInputState == EEscAscii
    if not @_mEscCharSetProber
      @_mEscCharSetProber = EscCharSetProber.new()
    end
    if @_mEscCharSetProber.feed(aBuf) == EFoundIt
      @result = {'encoding' =>  self._mEscCharSetProber.get_charset_name(),
                 'confidence' =>  @_mEscCharSetProber.get_confidence()
      }
      @done = true
    end
  elsif @_mInputState == EHighbyte
    if not @_mCharSetProbers or @_mCharSetProbers.empty?
      @_mCharSetProbers = [MBCSGroupProber.new(), SBCSGroupProber.new(), Latin1Prober.new()]
    end
    for prober in @_mCharSetProbers
      if prober.feed(aBuf) == EFoundIt
        @result = {'encoding' =>  prober.get_charset_name(),
                   'confidence' =>  prober.get_confidence()}
        @done = true
        break
      end
    end
  end
    
end
reset() click to toggle source
# File lib/tmail/vendor/rchardet-1.3/lib/rchardet/universaldetector.rb, line 46
# Puts the detector back into its starting state so it can be reused:
# clears the verdict and the progress flags, returns the input state to
# EPureAscii, and resets the escape prober (if one exists) and every prober
# in the prober list.
def reset
  @_mInputState = EPureAscii
  @_mLastChar = ''
  @_mGotData = false
  @_mStart = true
  @done = false
  @result = {'encoding' => nil, 'confidence' => 0.0}

  @_mEscCharSetProber.reset if @_mEscCharSetProber
  @_mCharSetProbers.each { |prober| prober.reset }
end

[Validate]

Generated with the Darkfish Rdoc Generator 2.