Package translate :: Package lang :: Module ngram
[hide private]
[frames] | [no frames]

Source Code for Module translate.lang.ngram

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright (c) 2006 Thomas Mangin 
  5  # Copyright (c) 2009 Zuza Software Foundation 
  6  # 
  7  # This program is distributed under Gnu General Public License 
  8  # (cf. the file COPYING in distribution). Alternatively, you can use 
  9  # the program under the conditions of the Artistic License (as Perl). 
 10  # 
 11  # This program is free software; you can redistribute it and/or modify 
 12  # it under the terms of the GNU General Public License as published by 
 13  # the Free Software Foundation; either version 2 of the License, or 
 14  # (at your option) any later version. 
 15  # 
 16  # This program is distributed in the hope that it will be useful, 
 17  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 18  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 19  # GNU General Public License for more details. 
 20  # 
 21  # You should have received a copy of the GNU General Public License 
 22  # along with this program; if not, write to the Free Software 
 23  # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
 24  # 
  25  # Original file from http://thomas.mangin.me.uk/data/source/ngram.py 
 26   
 27  import re 
 28   
 29  nb_ngrams = 400 
 30   
 31   
32 -class _NGram:
33
34 - def __init__(self, arg={}):
35 if isinstance(arg, basestring): 36 self.addText(arg) 37 self.normalise() 38 elif isinstance(arg, dict): 39 self.ngrams = arg 40 self.normalise() 41 else: 42 self.ngrams = dict()
43
44 - def addText(self, text):
45 if isinstance(text, str): 46 text = text.decode('utf-8') 47 48 ngrams = dict() 49 50 text = text.replace('\n', ' ') 51 text = re.sub('\s+', ' ', text) 52 words = text.split(' ') 53 54 for word in words: 55 word = '_' + word + '_' 56 size = len(word) 57 for i in xrange(size): 58 for s in (1, 2, 3, 4): 59 sub = word[i:i + s] 60 if not sub in ngrams: 61 ngrams[sub] = 0 62 ngrams[sub] += 1 63 64 if i + s >= size: 65 break 66 self.ngrams = ngrams 67 return self
68
69 - def sorted(self):
70 sorted = [(self.ngrams[k], k) for k in self.ngrams.keys()] 71 sorted.sort() 72 sorted.reverse() 73 sorted = sorted[:nb_ngrams] 74 return sorted
75
76 - def normalise(self):
77 count = 0 78 ngrams = {} 79 for v, k in self.sorted(): 80 ngrams[k] = count 81 count += 1 82 83 self.ngrams = ngrams 84 return self
85
86 - def addValues(self, key, value):
87 self.ngrams[key] = value 88 return self
89
90 - def compare(self, ngram):
91 d = 0 92 ngrams = ngram.ngrams 93 for k in self.ngrams.keys(): 94 if k in ngrams: 95 d += abs(ngrams[k] - self.ngrams[k]) 96 else: 97 d += nb_ngrams 98 return d
99 100 101 import os 102 import glob 103 104
class NGram:
    """Guess the language of a text by comparing its n-gram profile
    against language models (``.lm`` files) loaded from a folder.

    Each model file is named ``<language><ext>`` and contains one n-gram
    per line, optionally followed by an explicit integer rank.
    """

    def __init__(self, folder, ext='.lm'):
        """Load every language model matching ``<folder>/*<ext>``.

        Models that are not valid UTF-8 are skipped entirely.

        :raises ValueError: if no model files are found at all.
        """
        self.ngrams = {}
        pattern = os.path.join(folder, '*' + ext)
        ext_len = len(ext)
        count = 0

        for fname in glob.glob(os.path.normcase(pattern)):
            count += 1
            # Language code is the file name with the extension stripped.
            lang = os.path.split(fname)[-1][:-ext_len]
            ngrams = {}

            # Read bytes and decode per line ourselves so that a single
            # bad file can be skipped wholesale on UnicodeDecodeError.
            with open(fname, 'rb') as model_file:
                lines = model_file.readlines()

            try:
                # Files without explicit ranks get implicit ranks counting
                # down from len(lines): earlier lines rank higher.
                i = len(lines)
                for line in lines:
                    line = line.decode('utf-8')
                    parts = line[:-1].split()
                    if len(parts) != 2:
                        try:
                            ngrams[parts[0]] = i
                        except IndexError:
                            # Line probably only contained spaces, if anything
                            pass
                    else:
                        ngrams[parts[0]] = int(parts[1])
                    i -= 1
            except UnicodeDecodeError:
                continue

            if ngrams:
                self.ngrams[lang] = _NGram(ngrams)

        if not count:
            raise ValueError("no language files found")

    def classify(self, text):
        """Return the language code whose model best matches *text*.

        :returns: the closest language's code, or ``''`` when even the
            best match is too distant (over 80% of the maximum possible
            distance ``nb_ngrams ** 2``).
        """
        profile = _NGram(text)

        candidates = list(self.ngrams)
        best_lang = candidates.pop()
        best_distance = self.ngrams[best_lang].compare(profile)

        for lang in candidates:
            distance = self.ngrams[lang].compare(profile)
            if distance < best_distance:
                best_distance = distance
                best_lang = lang

        if best_distance > 0.8 * (nb_ngrams ** 2):
            # Nothing matched convincingly enough.
            best_lang = ''
        return best_lang
159 160
class Generate:
    """Build language-model profiles from raw ``.txt`` corpus files.

    Companion to :class:`NGram`: :meth:`save` writes the ``.lm`` model
    files that :class:`NGram` later loads.
    """

    def __init__(self, folder, ext='.txt'):
        """Profile every ``<folder>/*<ext>`` corpus file, keyed by the
        file name without its extension."""
        self.ngrams = {}
        pattern = os.path.join(folder, '*' + ext)
        ext_len = len(ext)

        for fname in glob.glob(os.path.normcase(pattern)):
            lang = os.path.split(fname)[-1][:-ext_len]
            profile = _NGram()

            # BUG FIX: the old code called addText() once per line, but
            # addText() replaces the profile, so only the LAST line of a
            # corpus was ever counted. Profile the whole file at once.
            # Corpora are expected to be UTF-8 (addText decodes bytes).
            with open(fname, 'rb') as corpus:
                profile.addText(corpus.read())

            profile.normalise()
            self.ngrams[lang] = profile

    def save(self, folder, ext='.lm'):
        """Write one ``<language><ext>`` model file per profiled
        language, one ``<ngram>\\t <count>`` pair per line."""
        for lang in self.ngrams:
            fname = os.path.join(folder, lang + ext)
            with open(fname, 'w', encoding='utf-8') as model_file:
                for count, key in self.ngrams[lang].sorted():
                    model_file.write("%s\t %d\n" % (key, count))
if __name__ == '__main__':
    import sys

    # Should you want to generate your own .lm files:
    #conf = Generate('/tmp')
    #conf.save('/tmp')

    # Classify a single line read from standard input and print the
    # guessed language code (empty string when nothing matches well).
    text = sys.stdin.readline()
    from translate.misc.file_discovery import get_abs_data_filename
    classifier = NGram(get_abs_data_filename('langmodels'))
    print(classifier.classify(text))