Package translate :: Package storage :: Module csvl10n
[hide private]
[frames] | [no frames]

Source Code for Module translate.storage.csvl10n

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2002-2006 Zuza Software Foundation 
  5  # 
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  # 
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21   
 22  """classes that hold units of comma-separated values (.csv) files (csvunit) 
 23  or entire files (csvfile) for use with localisation 
 24  """ 
 25   
 26  import csv 
 27  import logging 
 28  import codecs 
 29  try: 
 30      import cStringIO as StringIO 
 31  except: 
 32      import StringIO 
 33   
 34  from translate.misc import sparse 
 35  from translate.storage import base 
 36   
 37   
class SimpleDictReader:
    """A csv.DictReader-like reader built on translate.misc.sparse.

    The whole file is read and tokenized up front; next() then yields one
    dict per CSV row, keyed by the given fieldnames.  Quotes inside a
    field are tolerated (see the patching logic in next()).
    """

    def __init__(self, fileobj, fieldnames):
        # names to assign to the parsed columns, in order
        self.fieldnames = fieldnames
        self.contents = fileobj.read()
        # split on commas and newlines; treat \r as ignorable whitespace
        self.parser = sparse.SimpleParser(defaulttokenlist=[",", "\n"], whitespacechars="\r")
        self.parser.stringescaping = 0
        self.parser.quotechars = '"'
        self.tokens = self.parser.tokenize(self.contents)
        # index of the next token to consume in next()
        self.tokenpos = 0

    def __iter__(self):
        return self

    def getvalue(self, value):
        """returns a value, evaluating strings as necessary"""
        if (value.startswith("'") and value.endswith("'")) or (value.startswith('"') and value.endswith('"')):
            return sparse.stringeval(value)
        else:
            return value

    def next(self):
        """Return the next row as a dict mapping fieldname -> value.

        Missing trailing fields become empty strings; extra fields beyond
        fieldnames are dropped.  Raises StopIteration at end of input.
        """
        lentokens = len(self.tokens)
        # skip newlines left over from previous rows / blank lines
        while self.tokenpos < lentokens and self.tokens[self.tokenpos] == "\n":
            self.tokenpos += 1
        if self.tokenpos >= lentokens:
            raise StopIteration()
        thistokens = []
        # collect this row's tokens up to the terminating newline
        while self.tokenpos < lentokens and self.tokens[self.tokenpos] != "\n":
            thistokens.append(self.tokens[self.tokenpos])
            self.tokenpos += 1
        # consume the newline(s) ending the row
        while self.tokenpos < lentokens and self.tokens[self.tokenpos] == "\n":
            self.tokenpos += 1
        fields = []
        # patch together fields since we can have quotes inside a field
        currentfield = ''
        fieldparts = 0
        for token in thistokens:
            if token == ',':
                # a field is only quoted if the whole thing is quoted
                if fieldparts == 1:
                    currentfield = self.getvalue(currentfield)
                fields.append(currentfield)
                currentfield = ''
                fieldparts = 0
            else:
                currentfield += token
                fieldparts += 1
        # things after the last comma...
        if fieldparts:
            if fieldparts == 1:
                currentfield = self.getvalue(currentfield)
            fields.append(currentfield)
        values = {}
        for fieldnum in range(len(self.fieldnames)):
            if fieldnum >= len(fields):
                values[self.fieldnames[fieldnum]] = ""
            else:
                values[self.fieldnames[fieldnum]] = fields[fieldnum]
        return values
98
class DefaultDialect(csv.excel):
    """Default CSV dialect: based on csv.excel, but skipping whitespace
    after delimiters, quoting every non-numeric field, and allowing
    backslash escaping."""
    skipinitialspace = True
    quoting = csv.QUOTE_NONNUMERIC
    escapechar = '\\'
103 104 csv.register_dialect('default', DefaultDialect) 105
def from_unicode(text, encoding='utf-8'):
    """Encode *text* to a byte string if it is unicode; byte strings are
    passed through untouched.  An encoding of 'auto' means utf-8."""
    if encoding == 'auto':
        target_encoding = 'utf-8'
    else:
        target_encoding = encoding
    if not isinstance(text, unicode):
        return text
    return text.encode(target_encoding)
112
def to_unicode(text, encoding='utf-8'):
    """Decode *text* to unicode if it is a byte string; unicode objects
    are passed through untouched.  An encoding of 'auto' means utf-8."""
    if encoding == 'auto':
        source_encoding = 'utf-8'
    else:
        source_encoding = encoding
    if isinstance(text, unicode):
        return text
    return text.decode(source_encoding)
119
class csvunit(base.TranslationUnit):
    """A single row of a CSV localisation file.

    Holds location, source, target, id, fuzzy (stored as a string so it
    round-trips the file unchanged), context and the two comment fields.
    """

    # escapes for values spreadsheets would otherwise interpret as
    # formulas or numbers
    spreadsheetescapes = [("+", "\\+"), ("-", "\\-"), ("=", "\\="), ("'", "\\'")]

    def __init__(self, source=None):
        super(csvunit, self).__init__(source)
        self.location = ""
        self.source = source or ""
        self.target = ""
        self.id = ""
        # string, not bool: written to / read from the CSV as text
        self.fuzzy = 'False'
        self.developer_comments = ""
        self.translator_comments = ""
        self.context = ""

    def getid(self):
        """Return the unit's id, falling back to context + source."""
        if self.id:
            return self.id

        result = self.source
        context = self.context
        if context:
            # join context and source with EOT (\x04), msgctxt-style
            result = u"%s\04%s" % (context, result)

        return result

    def setid(self, value):
        self.id = value

    def getlocations(self):
        #FIXME: do we need to support more than one location
        return [self.location]

    def addlocation(self, location):
        # only one location is stored; the last one added wins
        self.location = location

    def getcontext(self):
        return self.context

    def setcontext(self, value):
        self.context = value

    def getnotes(self, origin=None):
        """Return the comments for the given origin.

        With origin None, translator and developer comments are joined
        with a newline.  Raises ValueError for unknown origins.
        """
        if origin is None:
            result = self.translator_comments
            if self.developer_comments:
                if result:
                    result += '\n' + self.developer_comments
                else:
                    result = self.developer_comments
            return result
        elif origin == "translator":
            return self.translator_comments
        elif origin in ('programmer', 'developer', 'source code'):
            return self.developer_comments
        else:
            raise ValueError("Comment type not valid")

    def addnote(self, text, origin=None, position="append"):
        """Add a note for the given origin.

        position is 'append', 'prepend', or anything else to replace the
        existing comments entirely.
        """
        if origin in ('programmer', 'developer', 'source code'):
            if position == 'append' and self.developer_comments:
                self.developer_comments += '\n' + text
            elif position == 'prepend' and self.developer_comments:
                self.developer_comments = text + '\n' + self.developer_comments
            else:
                self.developer_comments = text
        else:
            if position == 'append' and self.translator_comments:
                self.translator_comments += '\n' + text
            elif position == 'prepend' and self.translator_comments:
                # bugfix: this branch used to append (comments + text)
                # even though 'prepend' was requested
                self.translator_comments = text + '\n' + self.translator_comments
            else:
                self.translator_comments = text

    def removenotes(self):
        """Remove translator comments (developer comments are kept)."""
        self.translator_comments = u''

    def isfuzzy(self):
        # fuzzy is a string read from the file; accept common spellings
        return self.fuzzy.lower() in ('1', 'x', 'true', 'yes', 'fuzzy')

    def markfuzzy(self, value=True):
        if value:
            self.fuzzy = 'True'
        else:
            self.fuzzy = 'False'

    def match_header(self):
        """see if unit might be a header"""
        some_value = False
        for key, value in self.todict().iteritems():
            if value:
                some_value = True
            # a header cell (except 'fuzzy') contains its own column name
            if key.lower() != 'fuzzy' and value and key.lower() != value.lower():
                return False
        return some_value

    def add_spreadsheet_escapes(self, source, target):
        """add common spreadsheet escapes to two strings"""
        for unescaped, escaped in self.spreadsheetescapes:
            if source.startswith(unescaped):
                source = source.replace(unescaped, escaped, 1)
            if target.startswith(unescaped):
                target = target.replace(unescaped, escaped, 1)
        return source, target

    def remove_spreadsheet_escapes(self, source, target):
        """remove common spreadsheet escapes from two strings"""
        for unescaped, escaped in self.spreadsheetescapes:
            if source.startswith(escaped):
                source = source.replace(escaped, unescaped, 1)
            if target.startswith(escaped):
                target = target.replace(escaped, unescaped, 1)
        return source, target

    def fromdict(self, cedict, encoding='utf-8'):
        """Populate this unit from a CSV row dict (as read by DictReader).

        Keys are mapped through fieldname_map so legacy header names work;
        values are decoded to unicode with the given encoding.
        """
        for key, value in cedict.iteritems():
            rkey = fieldname_map.get(key, key)
            if value is None:
                continue
            value = to_unicode(value, encoding)
            if rkey == "id":
                self.id = value
            elif rkey == "source":
                self.source = value
            elif rkey == "target":
                self.target = value
            elif rkey == "location":
                self.location = value
            elif rkey == "fuzzy":
                self.fuzzy = value
            elif rkey == "context":
                self.context = value
            elif rkey == "translator_comments":
                self.translator_comments = value
            elif rkey == "developer_comments":
                self.developer_comments = value

        #self.source, self.target = self.remove_spreadsheet_escapes(self.source, self.target)

    def todict(self, encoding='utf-8'):
        """Return the unit as a dict of encoded byte strings keyed by the
        canonical field names, suitable for csv.DictWriter."""
        #FIXME: use apis?
        #source, target = self.add_spreadsheet_escapes(self.source, self.target)
        source = self.source
        target = self.target
        output = {
            'location': from_unicode(self.location, encoding),
            'source': from_unicode(source, encoding),
            'target': from_unicode(target, encoding),
            'id': from_unicode(self.id, encoding),
            'fuzzy': str(self.fuzzy),
            'context': from_unicode(self.context, encoding),
            'translator_comments': from_unicode(self.translator_comments, encoding),
            'developer_comments': from_unicode(self.developer_comments, encoding),
        }

        return output

    def __str__(self):
        return str(self.todict())
280 281 canonical_field_names = ('location', 'source', 'target', 'id', 'fuzzy', 'context', 'translator_comments', 'developer_comments') 282 fieldname_map = { 283 'original': 'source', 284 'untranslated': 'source', 285 'translated': 'target', 286 'translation': 'target', 287 'identified': 'id', 288 'key': 'id', 289 'label': 'id', 290 'transaltor comments': 'translator_comments', 291 'notes': 'translator_comments', 292 'developer comments': 'developer_comments', 293 'state': 'fuzzy', 294 } 295
def try_dialects(inputfile, fieldnames, dialect):
    """Build a csv.DictReader over inputfile, trying the given dialect
    first, then the registered 'default' dialect, then plain 'excel'."""
    #FIXME: does it verify at all if we don't actually step through the file?
    for candidate in (dialect, 'default'):
        try:
            inputfile.seek(0)
            return csv.DictReader(inputfile, fieldnames=fieldnames, dialect=candidate)
        except csv.Error:
            continue
    # last resort: let any error from the stock excel dialect propagate
    inputfile.seek(0)
    return csv.DictReader(inputfile, fieldnames=fieldnames, dialect='excel')
309
def valid_fieldnames(fieldnames):
    """Check if fieldnames are valid: a 'source' column must be present,
    either under its canonical name or one of the fieldname_map aliases.

    (Simplified: the old `fieldname in canonical_field_names and
    fieldname == 'source'` test was redundant, since equality with
    'source' already implies membership.)
    """
    for fieldname in fieldnames:
        if fieldname == 'source':
            return True
        if fieldname_map.get(fieldname) == 'source':
            return True
    return False
318
def detect_header(sample, dialect, fieldnames):
    """Test if file has a header or not, also returns number of columns in first row"""
    sample_file = StringIO.StringIO(sample)
    try:
        reader = csv.reader(sample_file, dialect)
    except csv.Error:
        # the sniffed dialect was rejected: retry with our registered
        # 'default' dialect, then with plain 'excel' as a last resort
        try:
            sample_file.seek(0)
            reader = csv.reader(sample_file, 'default')
        except csv.Error:
            sample_file.seek(0)
            reader = csv.reader(sample_file, 'excel')

    first_row = reader.next()
    if valid_fieldnames(first_row):
        # the first row really is a header: use it as the field names
        return first_row
    # no header: take a prefix of the caller's field names, but always at
    # least three columns (location, source, target)
    return fieldnames[:max(len(first_row), 3)]
337
class csvfile(base.TranslationStore):
    """This class represents a .csv file with various lines.
    The default format contains three columns: location, source, target"""
    UnitClass = csvunit
    Name = _("Comma Separated Value")
    Mimetypes = ['text/comma-separated-values', 'text/csv']
    Extensions = ["csv"]

    def __init__(self, inputfile=None, fieldnames=None, encoding="auto"):
        """Construct the store, optionally parsing inputfile.

        fieldnames may be a list or a comma-separated string; when absent
        the full canonical column set is assumed.
        """
        base.TranslationStore.__init__(self, unitclass=self.UnitClass)
        self.units = []
        self.encoding = encoding or 'utf-8'
        if not fieldnames:
            self.fieldnames = ['location', 'source', 'target', 'id', 'fuzzy', 'context', 'translator_comments', 'developer_comments']
        else:
            if isinstance(fieldnames, basestring):
                fieldnames = [fieldname.strip() for fieldname in fieldnames.split(",")]
            self.fieldnames = fieldnames
        self.filename = getattr(inputfile, 'name', '')
        self.dialect = 'default'
        if inputfile is not None:
            csvsrc = inputfile.read()
            inputfile.close()
            self.parse(csvsrc)

    def parse(self, csvsrc):
        """Parse the given CSV source into units, sniffing the dialect
        and detecting an optional header row."""
        text, encoding = self.detect_encoding(csvsrc, default_encodings=['utf-8', 'utf-16'])
        #FIXME: raise parse error if encoding detection fails?
        if encoding and encoding.lower() != 'utf-8':
            # re-encode as utf-8, stripping a BOM if present.
            # bugfix: lstrip(BOM_UTF8) treated the BOM as a *set* of bytes
            # and could eat the start of the first field; strip only an
            # exact BOM prefix instead.
            csvsrc = text.encode('utf-8')
            if csvsrc.startswith(codecs.BOM_UTF8):
                csvsrc = csvsrc[len(codecs.BOM_UTF8):]
        self.encoding = encoding or 'utf-8'

        sniffer = csv.Sniffer()
        # FIXME: maybe we should sniff a smaller sample
        sample = csvsrc[:1024]
        if isinstance(sample, unicode):
            sample = sample.encode('utf-8')

        try:
            self.dialect = sniffer.sniff(sample)
            if not self.dialect.escapechar:
                self.dialect.escapechar = '\\'
            if self.dialect.quoting == csv.QUOTE_MINIMAL:
                #HACKISH: most probably a default, not real detection
                self.dialect.quoting = csv.QUOTE_ALL
                self.dialect.doublequote = True
        except csv.Error:
            self.dialect = 'default'

        try:
            fieldnames = detect_header(sample, self.dialect, self.fieldnames)
            self.fieldnames = fieldnames
        except csv.Error:
            pass

        # bugfix: use the StringIO imported at module level; csv.StringIO
        # is an undocumented internal of the py2 csv module
        inputfile = StringIO.StringIO(csvsrc)
        reader = try_dialects(inputfile, self.fieldnames, self.dialect)

        # skip the first row if it is a header; every other row is a unit
        first_row = True
        for row in reader:
            newce = self.UnitClass()
            newce.fromdict(row)
            if not first_row or not newce.match_header():
                self.addunit(newce)
            first_row = False

    def __str__(self):
        """convert to a string. double check that unicode is handled somehow here"""
        source = self.getoutput()
        if not isinstance(source, unicode):
            source = source.decode('utf-8')
        if not self.encoding or self.encoding == 'auto':
            encoding = 'utf-8'
        else:
            encoding = self.encoding
        return source.encode(encoding)

    def getoutput(self):
        """Serialise all units (plus a header row) to a CSV string."""
        outputfile = StringIO.StringIO()
        writer = csv.DictWriter(outputfile, self.fieldnames, extrasaction='ignore', dialect=self.dialect)
        # write header: each field name maps to itself
        writer.writerow(dict(zip(self.fieldnames, self.fieldnames)))
        for ce in self.units:
            writer.writerow(ce.todict())
        return outputfile.getvalue()
427