Package translate :: Package lang :: Module data
[hide private]
[frames | no frames]

Source Code for Module translate.lang.data

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2007-2009 Zuza Software Foundation 
  5  # 
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  # 
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21   
 22  """This module stores information and functionality that relates to plurals.""" 
 23   
 24  import unicodedata 
 25   
 26  from translate.storage.placeables import StringElem 
 27   
 28   
 29  languages = { 
 30  'af': (u'Afrikaans', 2, '(n != 1)'), 
 31  'ak': (u'Akan', 2, 'n > 1'), 
 32  'am': (u'Amharic', 2, 'n > 1'), 
 33  'an': (u'Aragonese', 2, '(n != 1)'), 
 34  'ar': (u'Arabic', 6, 'n==0 ? 0 : n==1 ? 1 : n==2 ? 2 : n%100>=3 && n%100<=10 ? 3 : n%100>=11 ? 4 : 5'), 
 35  'arn': (u'Mapudungun; Mapuche', 2, 'n > 1'), 
 36  'ast': (u'Asturian; Bable; Leonese; Asturleonese', 2, '(n != 1)'), 
 37  'az': (u'Azerbaijani', 2, '(n != 1)'), 
 38  'be': (u'Belarusian', 3, 'n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2'), 
 39  'bg': (u'Bulgarian', 2, '(n != 1)'), 
 40  'bn': (u'Bengali', 2, '(n != 1)'), 
 41  'bn_IN': (u'Bengali (India)', 2, '(n != 1)'), 
 42  'bo': (u'Tibetan', 1, '0'), 
 43  'br': (u'Breton', 2, 'n > 1'), 
 44  'bs': (u'Bosnian', 3, 'n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2'), 
 45  'ca': (u'Catalan; Valencian', 2, '(n != 1)'), 
 46  'ca@valencia': (u'Catalan; Valencian (Valencia)', 2, '(n != 1)'), 
 47  'cs': (u'Czech', 3, '(n==1) ? 0 : (n>=2 && n<=4) ? 1 : 2'), 
 48  'csb': (u'Kashubian', 3, 'n==1 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2'), 
 49  'cy': (u'Welsh', 2, '(n==2) ? 1 : 0'), 
 50  'da': (u'Danish', 2, '(n != 1)'), 
 51  'de': (u'German', 2, '(n != 1)'), 
 52  'dz': (u'Dzongkha', 1, '0'), 
 53  'el': (u'Greek, Modern (1453-)', 2, '(n != 1)'), 
 54  'en': (u'English', 2, '(n != 1)'), 
 55  'en_GB': (u'English (United Kingdom)', 2, '(n != 1)'), 
 56  'en_ZA': (u'English (South Africa)', 2, '(n != 1)'), 
 57  'eo': (u'Esperanto', 2, '(n != 1)'), 
 58  'es': (u'Spanish; Castilian', 2, '(n != 1)'), 
 59  'et': (u'Estonian', 2, '(n != 1)'), 
 60  'eu': (u'Basque', 2, '(n != 1)'), 
 61  'fa': (u'Persian', 1, '0'), 
 62  'fi': (u'Finnish', 2, '(n != 1)'), 
 63  'fil': (u'Filipino; Pilipino', 2, '(n > 1)'), 
 64  'fo': (u'Faroese', 2, '(n != 1)'), 
 65  'fr': (u'French', 2, '(n > 1)'), 
 66  'fur': (u'Friulian', 2, '(n != 1)'), 
 67  'fy': (u'Frisian', 2, '(n != 1)'), 
 68  'ga': (u'Irish', 3, 'n==1 ? 0 : n==2 ? 1 : 2'), 
 69  'gl': (u'Galician', 2, '(n != 1)'), 
 70  'gu': (u'Gujarati', 2, '(n != 1)'), 
 71  'gun': (u'Gun', 2, '(n > 1)'), 
 72  'ha': (u'Hausa', 2, '(n != 1)'), 
 73  'he': (u'Hebrew', 2, '(n != 1)'), 
 74  'hi': (u'Hindi', 2, '(n != 1)'), 
 75  'hy': (u'Armenian', 1, '0'), 
 76  'hr': (u'Croatian', 3, '(n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2)'), 
 77  'hu': (u'Hungarian', 2, '(n != 1)'), 
 78  'ia': (u"Interlingua (International Auxiliary Language Association)", 2, '(n != 1)'), 
 79  'id': (u'Indonesian', 1, '0'), 
 80  'is': (u'Icelandic', 2, '(n != 1)'), 
 81  'it': (u'Italian', 2, '(n != 1)'), 
 82  'ja': (u'Japanese', 1, '0'), 
 83  'jv': (u'Javanese', 2, '(n != 1)'), 
 84  'ka': (u'Georgian', 1, '0'), 
 85  'kk': (u'Kazakh', 1, '0'), 
 86  'km': (u'Central Khmer', 1, '0'), 
 87  'kn': (u'Kannada', 2, '(n != 1)'), 
 88  'ko': (u'Korean', 1, '0'), 
 89  'ku': (u'Kurdish', 2, '(n != 1)'), 
 90  'kw': (u'Cornish', 4, '(n==1) ? 0 : (n==2) ? 1 : (n == 3) ? 2 : 3'), 
 91  'ky': (u'Kirghiz; Kyrgyz', 1, '0'), 
 92  'lb': (u'Luxembourgish; Letzeburgesch', 2, '(n != 1)'), 
 93  'ln': (u'Lingala', 2, '(n > 1)'), 
 94  'lo': (u'Lao', 1, '0'), 
 95  'lt': (u'Lithuanian', 3, '(n%10==1 && n%100!=11 ? 0 : n%10>=2 && (n%100<10 || n%100>=20) ? 1 : 2)'), 
 96  'lv': (u'Latvian', 3, '(n%10==1 && n%100!=11 ? 0 : n != 0 ? 1 : 2)'), 
 97  'mg': (u'Malagasy', 2, '(n > 1)'), 
 98  'mi': (u'Maori', 2, '(n > 1)'), 
 99  'mk': (u'Macedonian', 2, 'n==1 || n%10==1 ? 0 : 1'), 
100  'ml': (u'Malayalam', 2, '(n != 1)'), 
101  'mn': (u'Mongolian', 2, '(n != 1)'), 
102  'mr': (u'Marathi', 2, '(n != 1)'), 
103  'ms': (u'Malay', 1, '0'), 
104  'mt': (u'Maltese', 4, '(n==1 ? 0 : n==0 || ( n%100>1 && n%100<11) ? 1 : (n%100>10 && n%100<20 ) ? 2 : 3)'), 
105  'nah': (u'Nahuatl languages', 2, '(n != 1)'), 
106  'nap': (u'Neapolitan', 2, '(n != 1)'), 
107  'nb': (u'Bokmål, Norwegian; Norwegian Bokmål', 2, '(n != 1)'), 
108  'ne': (u'Nepali', 2, '(n != 1)'), 
109  'nl': (u'Dutch; Flemish', 2, '(n != 1)'), 
110  'nn': (u'Norwegian Nynorsk; Nynorsk, Norwegian', 2, '(n != 1)'), 
111  'nso': (u'Pedi; Sepedi; Northern Sotho', 2, '(n > 1)'), 
112  'oc': (u'Occitan (post 1500)', 2, '(n > 1)'), 
113  'or': (u'Oriya', 2, '(n != 1)'), 
114  'pa': (u'Panjabi; Punjabi', 2, '(n != 1)'), 
115  'pap': (u'Papiamento', 2, '(n != 1)'), 
116  'pl': (u'Polish', 3, '(n==1 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2)'), 
117  'pms': (u'Piemontese', 2, '(n != 1)'), 
118  'ps': (u'Pushto; Pashto', 2, '(n != 1)'), 
119  'pt': (u'Portuguese', 2, '(n != 1)'), 
120  'pt_BR': (u'Portuguese (Brazil)', 2, '(n > 1)'), 
121  'rm': (u'Romansh', 2, '(n != 1)'), 
122  'ro': (u'Romanian', 3, '(n==1 ? 0 : (n==0 || (n%100 > 0 && n%100 < 20)) ? 1 : 2);'), 
123  'ru': (u'Russian', 3, '(n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2)'), 
124  'sco': (u'Scots', 2, '(n != 1)'), 
125  'si': (u'Sinhala; Sinhalese', 2, '(n != 1)'), 
126  'sk': (u'Slovak', 3, '(n==1) ? 0 : (n>=2 && n<=4) ? 1 : 2'), 
127  'sl': (u'Slovenian', 4, '(n%100==1 ? 0 : n%100==2 ? 1 : n%100==3 || n%100==4 ? 2 : 3)'), 
128  'so': (u'Somali', 2, '(n != 1)'), 
129  'sq': (u'Albanian', 2, '(n != 1)'), 
130  'sr': (u'Serbian', 3, '(n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2)'), 
131  'st': (u'Sotho, Southern', 2, '(n != 1)'), 
132  'su': (u'Sundanese', 1, '0'), 
133  'sv': (u'Swedish', 2, '(n != 1)'), 
134  'sw': (u'Swahili', 2, '(n != 1)'), 
135  'ta': (u'Tamil', 2, '(n != 1)'), 
136  'te': (u'Telugu', 2, '(n != 1)'), 
137  'tg': (u'Tajik', 2, '(n != 1)'), 
138  'ti': (u'Tigrinya', 2, '(n > 1)'), 
139  'th': (u'Thai', 1, '0'), 
140  'tk': (u'Turkmen', 2, '(n != 1)'), 
141  'tr': (u'Turkish', 1, '0'), 
142  'tt': (u'Tatar', 1, '0'), 
143  'ug': (u'Uighur; Uyghur', 1, '0'), 
144  'uk': (u'Ukrainian', 3, '(n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2)'), 
145  'vi': (u'Vietnamese', 1, '0'), 
146  'wa': (u'Walloon', 2, '(n > 1)'), 
147  # Chinese is difficult because the main divide is on script, not really 
148  # country. Simplified Chinese is used mostly in China, Singapore and Malaysia. 
149  # Traditional Chinese is used mostly in Hong Kong, Taiwan and Macau. 
150  'zh_CN': (u'Chinese (China)', 1, '0'), 
151  'zh_HK': (u'Chinese (Hong Kong)', 1, '0'), 
152  'zh_TW': (u'Chinese (Taiwan)', 1, '0'), 
153  'zu': (u'Zulu', 2, '(n != 1)'), 
154  } 
155  """Dictionary of language data. 
156  The language code is the dictionary key (which may contain country codes and modifiers). 
157  The value is a tuple: (Full name in English from iso-codes, nplurals, plural equation). 
158   
159  Note that the English names should not be used in user facing places - it 
160  should always be passed through the function returned from tr_lang(), or at 
161  least passed through _fix_language_name().""" 
162   
# Maps the verbose English names used in the language table to shorter
# display names; _fix_language_name() looks names up here.
_fixed_names = {
    u"Asturian; Bable; Leonese; Asturleonese": u"Asturian",
    u"Bokmål, Norwegian; Norwegian Bokmål": u"Norwegian Bokmål",
    u"Catalan; Valencian": u"Catalan",
    u"Central Khmer": u"Khmer",
    u"Chichewa; Chewa; Nyanja": u"Chewa; Nyanja",
    u"Divehi; Dhivehi; Maldivian": u"Divehi",
    u"Dutch; Flemish": u"Dutch",
    u"Filipino; Pilipino": u"Filipino",
    u"Greek, Modern (1453-)": u"Greek",
    u"Interlingua (International Auxiliary Language Association)": u"Interlingua",
    u"Kirghiz; Kyrgyz": u"Kirghiz",
    u"Klingon; tlhIngan-Hol": u"Klingon",
    u"Limburgan; Limburger; Limburgish": u"Limburgish",
    u"Low German; Low Saxon; German, Low; Saxon, Low": u"Low German",
    u"Luxembourgish; Letzeburgesch": u"Luxembourgish",
    u"Ndebele, South; South Ndebele": u"Southern Ndebele",
    u"Norwegian Nynorsk; Nynorsk, Norwegian": u"Norwegian Nynorsk",
    u"Occitan (post 1500)": u"Occitan",
    u"Panjabi; Punjabi": u"Punjabi",
    u"Pedi; Sepedi; Northern Sotho": u"Northern Sotho",
    u"Pushto; Pashto": u"Pashto",
    u"Sinhala; Sinhalese": u"Sinhala",
    u"Sotho, Southern": u"Sotho",
    u"Spanish; Castilian": u"Spanish",
    u"Uighur; Uyghur": u"Uighur",
}
190   
191   
def simplercode(code):
    """Strip the last subtag (for example the country code) from a
    language code.

    The position of the last separator is found on the normalized form of
    the code (see L{normalize_code}), but the returned prefix is cut from
    the code exactly as it was given.

    @see:
      - U{http://www.rfc-editor.org/rfc/bcp/bcp47.txt}
      - U{http://www.rfc-editor.org/rfc/rfc4646.txt}
      - U{http://www.rfc-editor.org/rfc/rfc4647.txt}
      - U{http://www.w3.org/International/articles/language-tags/}

    @param code: The language code to simplify
    @return: The code without its last subtag, C{""} if there is nothing
        left to strip, or C{code} itself if it is empty or C{None}.
    """
    if not code:
        return code

    cut = normalize_code(code).rfind('-')
    if cut < 0:
        # No separator at all: the code cannot be simplified any further.
        return ""
    return code[:cut]
expansion_factors = {
    'af': 0.1,
    'ar': -0.09,
    'es': 0.21,
    'fr': 0.28,
    'it': 0.2,
}
"""Source to target string length expansion factors."""

import gettext
import locale
import re
import os

# Filled lazily by gettext_lang(): language code -> gettext function.
iso639 = {}
"""ISO 639 language codes"""

# Filled lazily by gettext_country(): language code -> gettext function.
iso3166 = {}
"""ISO 3166 country codes"""

# Case-sensitive: lower-case language, optional upper-case region,
# optional "@modifier".
langcode_re = re.compile("^[a-z]{2,3}([_-][A-Z]{2,3}|)(@[a-zA-Z0-9]+|)$")
# Same shape as langcode_re, but matched without regard to case.
langcode_ire = re.compile("^[a-z]{2,3}([_-][a-z]{2,3})?(@[a-z0-9]+)?$", re.IGNORECASE)
# Only the part after the language: separator, region, optional "@modifier".
variant_re = re.compile("^[_-][A-Z]{2,3}(@[a-zA-Z0-9]+|)$")


def languagematch(languagecode, otherlanguagecode):
    """Match a language code against another, ignoring regions in the second.

    @param languagecode: The code to look for; if C{None}, the other code
        is only checked for being a well-formed language code.
    @param otherlanguagecode: The code being tested
    @return: A true value if the codes match, a false value otherwise. The
        result is either a boolean or the result of a regular expression
        match, so only its truth value should be relied upon.
    """
    if languagecode is None:
        return langcode_re.match(otherlanguagecode)
    if languagecode == otherlanguagecode:
        return True
    if not otherlanguagecode.startswith(languagecode):
        return False
    # Whatever follows the shared prefix must look like a region/variant.
    remainder = otherlanguagecode[len(languagecode):]
    return variant_re.match(remainder)

# Splits names of the form "language (country)". The parenthesised part is
# capped at 25 characters and may not contain digits, so that names such as
# "Interlingua (International Auxiliary Language Association)" are not
# mistaken for a language/country pair.
dialect_name_re = re.compile(r"(.+)\s\(([^)\d]{,25})\)$")


def tr_lang(langcode=None):
    """Build a function that translates language names.

    The returned function also accepts names of the form
    C{"language (country)"}, translating both parts separately.

    @param langcode: ISO code of the language to translate into; the system
        language is used if no language is specified.
    @return: A function taking an English language name and returning its
        translated, tidied-up form.
    """
    translate_language = gettext_lang(langcode)
    translate_country = gettext_country(langcode)

    def handlelanguage(name):
        parsed = dialect_name_re.match(name)
        if not parsed:
            return _fix_language_name(translate_language(name))
        language, country = parsed.groups()
        fixed_language = _fix_language_name(translate_language(language))
        return u"%s (%s)" % (fixed_language, translate_country(country))

    return handlelanguage


def _fix_language_name(name):
    """Replace or shorten some unsightly names present in iso-codes.

    A name listed in _fixed_names is assumed to be untranslated and is
    swapped for its more usable rendering. Any other name longer than 11
    characters that contains a semi-colon at or after its sixth character
    is cut off just before that semi-colon, to keep things neat.

    @param name: The (possibly translated) language name
    @return: The tidied-up name
    """
    try:
        return _fixed_names[name]
    except KeyError:
        pass

    if len(name) > 11:
        # These constants are somewhat arbitrary, but testing with the
        # Japanese translation of ISO codes suggests these as the upper
        # bounds.
        semicolon = name.find(u';', 5)
        if semicolon >= 0:
            return name[:semicolon]
    return name
282 283
def gettext_lang(langcode=None):
    """Return a gettext function that translates language names.

    The function for each language is created once and then cached in the
    module-level C{iso639} dictionary.

    @param langcode: Code of the language to translate into; the system
        language is used if no language is specified.
    @return: The C{ugettext} method of the C{iso_639} translation for the
        requested language (a null translation if no catalogue is found).
    """
    # Settle on the cache key *before* the lookup. Entries for "no language"
    # are stored under "", so looking up the raw None would never hit the
    # cache and the catalogue would be reloaded on every call.
    if not langcode:
        langcode = ""

    if langcode not in iso639:
        wanted = None
        if langcode:
            wanted = [langcode]
        elif os.name == "nt":
            # On Windows the default locale is not used for some reason, so
            # it is passed explicitly. getdefaultlocale() may be unable to
            # determine it and return None, which must not reach gettext.
            default_locale = locale.getdefaultlocale()[0]
            if default_locale:
                wanted = [default_locale]
        t = gettext.translation('iso_639', languages=wanted, fallback=True)
        iso639[langcode] = t.ugettext
    return iso639[langcode]
299 300
def gettext_country(langcode=None):
    """Return a gettext function that translates country names.

    The function for each language is created once and then cached in the
    module-level C{iso3166} dictionary.

    @param langcode: Code of the language to translate into; the system
        language is used if no language is specified.
    @return: The C{ugettext} method of the C{iso_3166} translation for the
        requested language (a null translation if no catalogue is found).
    """
    # Settle on the cache key *before* the lookup. Entries for "no language"
    # are stored under "", so looking up the raw None would never hit the
    # cache and the catalogue would be reloaded on every call.
    if not langcode:
        langcode = ""

    if langcode not in iso3166:
        wanted = None
        if langcode:
            wanted = [langcode]
        elif os.name == "nt":
            # On Windows the default locale is not used for some reason, so
            # it is passed explicitly. getdefaultlocale() may be unable to
            # determine it and return None, which must not reach gettext.
            default_locale = locale.getdefaultlocale()[0]
            if default_locale:
                wanted = [default_locale]
        t = gettext.translation('iso_3166', languages=wanted, fallback=True)
        iso3166[langcode] = t.ugettext
    return iso3166[langcode]
316 317
def normalize(string, normal_form="NFC"):
    """Return a unicode string in its normalized form.

    @param string: The string to be normalized; C{None} is passed through
    @param normal_form: NFC (default), NFD, NFKC, NFKD
    @return: Normalized string, or C{None} if C{string} is C{None}
    """
    if string is not None:
        return unicodedata.normalize(normal_form, string)
    return None
329 330
def forceunicode(string):
    """Ensure that the string is in unicode.

    Byte strings are decoded using their own C{encoding} attribute when
    they have one, and as UTF-8 otherwise. C{StringElem} objects are
    converted with C{unicode()}. Anything else is returned untouched.

    @param string: A text string
    @type string: Unicode, String
    @return: String converted to Unicode as needed, or C{None} if
        C{string} is C{None}.
    @rtype: Unicode
    """
    if string is None:
        return None
    if isinstance(string, str):
        return string.decode(getattr(string, "encoding", "utf-8"))
    if isinstance(string, StringElem):
        return unicode(string)
    return string
347 348
def normalized_unicode(string):
    """Force the string to unicode and normalize it.

    @param string: A text string, or C{None}
    @return: The result of L{normalize} applied to the unicode form of
        C{string}
    """
    as_unicode = forceunicode(string)
    return normalize(as_unicode)
352 353
def normalize_code(code):
    """Bring a language code into a canonical form for comparisons.

    Both C{"_"} and C{"@"} are turned into C{"-"} and the result is
    lower-cased, so C{"pt_BR"} becomes C{"pt-br"}.

    @param code: The language code; empty values are returned unchanged
    @return: The normalized code
    """
    if code:
        return code.replace("@", "-").replace("_", "-").lower()
    return code
358 359
def simplify_to_common(language_code, languages=languages):
    """Simplify language code to the most commonly used form for the
    language, stripping country information for languages that tend
    not to be localized differently for different countries.

    Subtags are stripped one at a time until the code, compared in its
    normalized form, is one of the keys of C{languages}. If no shorter form
    is known, the last code that could not be simplified is returned.

    @param language_code: The language code to simplify
    @param languages: Mapping whose keys are the known language codes
        (defaults to the module-level language table)
    @return: The simplified language code
    """
    simpler = simplercode(language_code)
    # simplercode() hands back "" when nothing is left to strip and passes
    # empty input (including None) through unchanged; every such value must
    # end the recursion, otherwise None would recurse forever.
    if not simpler:
        return language_code

    wanted = normalize_code(language_code)
    if any(normalize_code(key) == wanted for key in languages):
        return language_code

    # Keep using the caller's table while recursing instead of silently
    # falling back to the default one.
    return simplify_to_common(simpler, languages)
369