Package translate :: Package storage :: Module wordfast
[hide private]
[frames] | no frames]

Source Code for Module translate.storage.wordfast

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2007-2010 Zuza Software Foundation 
  5  # 
  6  # This file is part of the Translate Toolkit. 
  7  # 
  8  # This program is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  # 
 13  # This program is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with this program; if not, see <http://www.gnu.org/licenses/>. 
 20   
 21  """Manage the Wordfast Translation Memory format 
 22   
 23     Wordfast TM format is the Translation Memory format used by the 
 24     U{Wordfast<http://www.wordfast.net/>} computer aided translation tool. 
 25   
 26     It is a bilingual base class derived format with L{WordfastTMFile} 
 27     and L{WordfastUnit} providing file and unit level access. 
 28   
 29     Wordfast tools 
 30     ============== 
 31     Wordfast is a computer aided translation tool.  It is an application 
 32     built on top of Microsoft Word and is implemented as a rather 
 33     sophisticated set of macros.  Understanding that helps us understand 
 34     many of the seemingly strange choices around this format including: 
 35     encoding, escaping and file naming. 
 36   
 37     Implementation 
 38     ============== 
 39     The implementation covers the full requirements of a Wordfast TM file. 
 40     The files are simple Tab Separated Value (TSV) files that can be read 
 41     by Microsoft Excel and other spreadsheet programs.  They use the .txt 
 42     extension which does make it more difficult to automatically identify 
 43     such files. 
 44   
 45     The dialect of the TSV files is specified by L{WordfastDialect}. 
 46   
 47     Encoding 
 48     -------- 
 49     The files are UTF-16 or ISO-8859-1 (Latin1) encoded.  These choices 
 50     are most likely because Microsoft Word is the base editing tool for 
 51     Wordfast. 
 52   
 53     The format is tab separated so we are able to detect UTF-16 vs Latin-1 
 54     by searching for the occurrence of a UTF-16 tab character and then
 55     continuing with the parsing. 
 56   
 57     Timestamps 
 58     ---------- 
 59     L{WordfastTime} allows for the correct management of the Wordfast 
 60     YYYYMMDD~HHMMSS timestamps.  A unit's timestamp is refreshed whenever
 61     its source or target text is changed through the unit's properties.
 62   
 63     Header 
 64     ------ 
 65     L{WordfastHeader} provides header management support.  The header 
 66     functionality is fully implemented through observing the behaviour of the 
 67     files in real use cases, input from the Wordfast programmers and 
 68     public documentation. 
 69   
 70     Escaping 
 71     -------- 
 72     Wordfast TM implements a form of escaping that covers two aspects: 
 73       1. Placeable: bold, formatting, etc.  These are left as is and ignored.
 74          It is up to the editor and future placeable implementation to manage 
 75          these. 
 76       2. Escapes: items that may confuse Excel or translators are 
 77          escaped as &'XX;. These are fully implemented and are converted to 
 78          and from Unicode.  By observing behaviour and reading documentation 
 79          we were able to observe all possible escapes. Unfortunately the
 80          escaping differs slightly between Windows and Mac version.  This 
 81          might cause errors in future. 
 82     Functions allow for L{conversion to Unicode<_wf_to_char>} and L{back to 
 83     Wordfast escapes<_char_to_wf>}. 
 84   
 85     Extended Attributes 
 86     ------------------- 
 87     The last 4 columns allow users to define and manage extended attributes. 
 88     These are left as is and are not directly managed by our implementation.
 89  """ 
 90   
 91  import csv 
 92  import sys 
 93  import time 
 94   
 95  from translate.storage import base 
 96   
 97  WF_TIMEFORMAT = "%Y%m%d~%H%M%S" 
 98  """Time format used by Wordfast""" 
 99   
100  WF_FIELDNAMES_HEADER = ["date", "userlist", "tucount", "src-lang", "version", 
101                          "target-lang", "license", "attr1list", "attr2list", 
102                          "attr3list", "attr4list", "attr5list"] 
103  """Field names for the Wordfast header""" 
104   
105  WF_FIELDNAMES = ["date", "user", "reuse", "src-lang", "source", "target-lang", 
106                   "target", "attr1", "attr2", "attr3", "attr4"] 
107  """Field names for a Wordfast TU""" 
108   
109  WF_FIELDNAMES_HEADER_DEFAULTS = { 
110  "date": "%19000101~121212", 
111  "userlist": "%User ID,TT,TT Translate-Toolkit", 
112  "tucount": "%TU=00000001", 
113  "src-lang": "%EN-US", 
114  "version": "%Wordfast TM v.5.51w9/00", 
115  "target-lang": "", 
116  "license": "%---00000001", 
117  "attr1list": "", 
118  "attr2list": "", 
119  "attr3list": "", 
120  "attr4list": ""} 
121  """Default or minimum header entries for a Wordfast file""" 
122   
# TODO Needs validation.  The following need to be checked against a WF TM file
# to ensure that the correct Unicode values have been chosen for the characters.
# For now these look correct and have been taken from Windows CP1252 and
# Macintosh code points found for the respective character sets on Linux.
#
# Each entry pairs a Wordfast &'XX; escape with the character it stands for.
# The ampersand entry must stay first so that, when escaping, the ampersands
# introduced by the other escapes are not escaped again.
WF_ESCAPE_MAP = (
    ("&'26;", u"\u0026"),  # & - Ampersand
    ("&'82;", u"\u201A"),  # ‚ - Single low-9 quotation mark
    ("&'85;", u"\u2026"),  # … - Ellipsis
    ("&'91;", u"\u2018"),  # ‘ - Left single quotation mark
    ("&'92;", u"\u2019"),  # ’ - Right single quotation mark
    ("&'93;", u"\u201C"),  # “ - Left double quotation mark
    ("&'94;", u"\u201D"),  # ” - Right double quotation mark
    ("&'96;", u"\u2013"),  # – - En dash (validate)
    ("&'97;", u"\u2014"),  # — - Em dash (validate)
    ("&'99;", u"\u2122"),  # ™ - Trade mark

    # Windows only
    ("&'A0;", u"\u00A0"),  # Non breaking space
    ("&'A9;", u"\u00A9"),  # © - Copyright
    ("&'AE;", u"\u00AE"),  # ® - Registered
    ("&'BC;", u"\u00BC"),  # ¼
    ("&'BD;", u"\u00BD"),  # ½
    ("&'BE;", u"\u00BE"),  # ¾

    # Mac only
    ("&'A8;", u"\u00AE"),  # ® - Registered
    ("&'AA;", u"\u2122"),  # ™ - Trade mark
    ("&'C7;", u"\u00AB"),  # « - Left-pointing double angle quotation mark
    ("&'C8;", u"\u00BB"),  # » - Right-pointing double angle quotation mark
    ("&'C9;", u"\u2026"),  # … - Horizontal ellipsis
    ("&'CA;", u"\u00A0"),  # Non breaking space
    ("&'D0;", u"\u2013"),  # – - En dash (validate)
    ("&'D1;", u"\u2014"),  # — - Em dash (validate)
    ("&'D2;", u"\u201C"),  # “ - Left double quotation mark
    ("&'D3;", u"\u201D"),  # ” - Right double quotation mark
    ("&'D4;", u"\u2018"),  # ‘ - Left single quotation mark
    ("&'D5;", u"\u2019"),  # ’ - Right single quotation mark
    ("&'E2;", u"\u201A"),  # ‚ - Single low-9 quotation mark
    ("&'E3;", u"\u201E"),  # „ - Double low-9 quotation mark

    # Other markers
    # The soft-break escape &'B; (newline, could also be represented by
    # \u2028) is deliberately not mapped: XXX it creates a problem with
    # roundtripping.
)
"""Mapping of Wordfast &'XX; escapes to correct Unicode characters"""

# Byte pair looked for in the first line of a file to detect UTF-16 content.
TAB_UTF16 = "\x00\x09"
"""The tab \\t character as it would appear in UTF-16 encoding"""
170   
171   
def _char_to_wf(string):
    """Replace special characters by their Wordfast &'XX; escapes.

    Every character listed in WF_ESCAPE_MAP is replaced by its escape code
    (the first code listed for a character wins), after which NEWLINE and
    TAB are turned into the two-character sequences \\n and \\t.

    Full roundtripping is not possible because of the escaping of
    NEWLINE \\n and TAB \\t.

    @param string: UTF-8 encoded text; empty values and None are returned
                   untouched
    @return: the escaped text
    """
    # FIXME there is no platform check to ensure that we use Mac encodings
    # when running on a Mac
    if not string:
        return string
    for escape, character in WF_ESCAPE_MAP:
        string = string.replace(character.encode('utf-8'), escape)
    return string.replace("\n", "\\n").replace("\t", "\\t")
184 185
def _wf_to_char(string):
    """Wordfast &'XX; escapes -> Char

    Every escape code listed in WF_ESCAPE_MAP is replaced by the UTF-8
    encoded character it stands for, after which the two-character sequences
    \\n and \\t are turned back into NEWLINE and TAB.

    @param string: UTF-8 encoded text containing Wordfast escapes; empty
                   values and None are returned untouched
    @return: the unescaped text
    """
    if string:
        # Walk the map backwards so that the ampersand escape, which is the
        # first entry, is decoded last.  Decoding it first would turn an
        # escaped literal such as "&'26;'85;" into "&'85;", which the loop
        # would then wrongly decode a second time into an ellipsis.
        for code, char in reversed(WF_ESCAPE_MAP):
            string = string.replace(code, char.encode('utf-8'))
        string = string.replace("\\n", "\n").replace("\\t", "\t")
    return string
193 194
class WordfastDialect(csv.Dialect):
    """CSV dialect of the TAB-delimited files generated by Wordfast.

    Fields are separated by a tab, rows are terminated by CRLF and no
    quoting is applied.
    """
    quoting = csv.QUOTE_NONE
    lineterminator = "\r\n"
    delimiter = "\t"
    if sys.version_info < (2, 5, 0):
        # The csv module in Python < 2.5 needs all of the following items to
        # be defined.  Quoting becomes minimal there: Wordfast does not quote
        # anything, but since \t is escaped anyway in _char_to_wf this should
        # not be a problem.
        quotechar = '"'
        escapechar = None
        skipinitialspace = False
        doublequote = False
        quoting = csv.QUOTE_MINIMAL


# Make the dialect available to csv readers and writers as "wordfast".
csv.register_dialect("wordfast", WordfastDialect)
class WordfastTime(object):
    """Holds a time stamp and converts it to and from the Wordfast
    YYYYMMDD~hhmmss notation"""

    def __init__(self, newtime=None):
        """Initialise from a Wordfast time string or a time.struct_time.

        Empty values, and values of any other type, leave the time unset.
        """
        self._time = None
        if newtime and isinstance(newtime, basestring):
            self.timestring = newtime
        elif newtime and isinstance(newtime, time.struct_time):
            self.time = newtime
        elif not newtime:
            self.time = None

    def get_timestring(self):
        """Return the time formatted with WF_TIMEFORMAT, or None if unset"""
        if self._time:
            return time.strftime(WF_TIMEFORMAT, self._time)
        return None

    def set_timestring(self, timestring):
        """Parse a Wordfast formatted time string and store the result

        @param timestring: A Wordfast time string (YYYYMMDD~hhmmss)
        @type timestring: String
        """
        parsed = time.strptime(timestring, WF_TIMEFORMAT)
        self._time = parsed
    timestring = property(get_timestring, set_timestring)

    def get_time(self):
        """Return the stored time.struct_time, or None if unset"""
        return self._time

    def set_time(self, newtime):
        """Store a new time, discarding anything that is not a struct_time

        @param newtime: a new time object
        @type newtime: time.struct_time
        """
        usable = bool(newtime) and isinstance(newtime, time.struct_time)
        if not usable:
            newtime = None
        self._time = newtime
    time = property(get_time, set_time)

    def __str__(self):
        # An unset time is rendered as the empty string rather than "None".
        return self.timestring or ""
261 262
class WordfastHeader(object):
    """A wordfast translation memory header"""

    def __init__(self, header=None):
        """Create a header from a dictionary of header fields.

        @param header: header fields keyed by the header field names; when
                       empty or None a default header is created.  Values
                       that are not dictionaries are ignored and leave the
                       header empty.
        @type header: Dict
        """
        # The header is used as a mapping everywhere, so start out with an
        # empty dictionary rather than a list.
        self._header_dict = {}
        if not header:
            self.header = self._create_default_header()
        elif isinstance(header, dict):
            self.header = header

    def _create_default_header(self):
        """Create a default Wordfast header with the date set to the current
        time"""
        # Work on a copy: the defaults are shared module-level state and must
        # not pick up the date, target language or TU count of this header.
        defaultheader = WF_FIELDNAMES_HEADER_DEFAULTS.copy()
        defaultheader['date'] = '%%%s' % WordfastTime(time.localtime()).timestring
        return defaultheader

    def getheader(self):
        """Get the header dictionary"""
        return self._header_dict

    def setheader(self, newheader):
        """Replace the header dictionary (the new value is not validated)"""
        self._header_dict = newheader
    header = property(getheader, setheader)

    def settargetlang(self, newlang):
        """Store the target language, prefixed by '%', in the header"""
        self._header_dict['target-lang'] = '%%%s' % newlang
    targetlang = property(None, settargetlang)

    def settucount(self, count):
        """Store the number of units as '%TU=' plus an 8 digit, zero padded
        number"""
        self._header_dict['tucount'] = '%%TU=%08d' % count
    tucount = property(None, settucount)
295 296
class WordfastUnit(base.TranslationUnit):
    """A single unit (one data row) of a Wordfast translation memory"""

    def __init__(self, source=None):
        """Create a unit, optionally setting its source text"""
        self._dict = {}
        if source:
            self.source = source
        super(WordfastUnit, self).__init__(source)

    def _update_timestamp(self):
        """Set the unit's date field to the current local time"""
        now = WordfastTime(time.localtime())
        self._dict['date'] = now.timestring

    def getdict(self):
        """Return the dictionary holding the fields of this Wordfast line"""
        return self._dict

    def setdict(self, newdict):
        """Replace the dictionary holding the fields of this Wordfast line

        @param newdict: a new dictionary with Wordfast line elements
        @type newdict: Dict
        """
        # TODO First check that the values are OK
        self._dict = newdict
    dict = property(getdict, setdict)

    def _get_source_or_target(self, key):
        """Return the unescaped, decoded text stored under key.

        A missing or None field gives None, an empty field gives "".
        """
        stored = self._dict.get(key, None)
        if stored is None:
            return None
        if not stored:
            return ""
        return _wf_to_char(stored).decode('utf-8')

    def _set_source_or_target(self, key, newvalue):
        """Store newvalue under key in escaped, UTF-8 encoded form.

        The timestamp is refreshed only when the stored value changes.
        """
        if newvalue is None:
            # Nothing to encode or escape, and clearing a field does not
            # touch the timestamp.
            self._dict[key] = None
            return
        if isinstance(newvalue, unicode):
            newvalue = newvalue.encode('utf-8')
        escaped = _char_to_wf(newvalue)
        unchanged = key in self._dict and escaped == self._dict[key]
        if unchanged:
            return
        self._dict[key] = escaped
        self._update_timestamp()

    def getsource(self):
        """Return the source text"""
        return self._get_source_or_target('source')

    def setsource(self, newsource):
        """Set the source text, dropping the cached rich source"""
        self._rich_source = None
        return self._set_source_or_target('source', newsource)
    source = property(getsource, setsource)

    def gettarget(self):
        """Return the target text"""
        return self._get_source_or_target('target')

    def settarget(self, newtarget):
        """Set the target text, dropping the cached rich target"""
        self._rich_target = None
        return self._set_source_or_target('target', newtarget)
    target = property(gettarget, settarget)

    def settargetlang(self, newlang):
        """Store the target language of this unit as given"""
        self._dict['target-lang'] = newlang
    targetlang = property(None, settargetlang)

    def __str__(self):
        return str(self._dict)

    def istranslated(self):
        """A unit counts as translated when both the source and the target
        fields are non-empty"""
        has_source = bool(self._dict.get('source', None))
        return has_source and bool(self._dict.get('target', None))
369 370
class WordfastTMFile(base.TranslationStore):
    """A Wordfast translation memory file"""
    Name = _("Wordfast Translation Memory")
    Mimetypes = ["application/x-wordfast"]
    Extensions = ["txt"]

    def __init__(self, inputfile=None, unitclass=WordfastUnit):
        """construct a Wordfast TM, optionally reading in from inputfile."""
        self.UnitClass = unitclass
        base.TranslationStore.__init__(self, unitclass=unitclass)
        self.filename = ''
        self.header = WordfastHeader()
        self._encoding = 'iso-8859-1'
        if inputfile is not None:
            self.parse(inputfile)

    def parse(self, input):
        """Parse the given file or file source string.

        The first line is read as the header, every following line becomes
        a unit of this store.

        @param input: an open file-like object (it is read completely and
                      then closed) or the raw, still encoded file contents
        @raise ValueError: if the contents cannot be decoded
        """
        if hasattr(input, 'name'):
            self.filename = input.name
        elif not getattr(self, 'filename', ''):
            self.filename = ''
        if hasattr(input, "read"):
            # Close the file even when reading it fails.
            try:
                tmsrc = input.read()
            finally:
                input.close()
            input = tmsrc
        # A UTF-16 tab in the first line identifies a UTF-16 file, anything
        # else is treated as Latin-1.
        if TAB_UTF16 in input.split("\n")[0]:
            self._encoding = 'utf-16'
        else:
            self._encoding = 'iso-8859-1'
        try:
            input = input.decode(self._encoding).encode('utf-8')
        except Exception:
            # Not a bare except: KeyboardInterrupt and SystemExit must not be
            # reported as an encoding problem.
            raise ValueError("Wordfast files are either UTF-16 (UCS2) or ISO-8859-1 encoded")
        rows = input.split("\n")
        for header in csv.DictReader(rows[:1],
                                     fieldnames=WF_FIELDNAMES_HEADER,
                                     dialect="wordfast"):
            self.header = WordfastHeader(header)
        lines = csv.DictReader(rows[1:],
                               fieldnames=WF_FIELDNAMES,
                               dialect="wordfast")
        # NOTE(review): units are always created as WordfastUnit, whatever
        # unitclass was passed to __init__ - confirm whether self.UnitClass
        # should be used here instead.
        for line in lines:
            newunit = WordfastUnit()
            newunit.dict = line
            self.addunit(newunit)

    def __str__(self):
        """Serialise the header and all translated units.

        Untranslated units are left out; a store without any translated unit
        gives an empty string.  The result is encoded in the encoding found
        while parsing, falling back to UTF-16 if that encoding cannot
        represent the text.
        """
        output = csv.StringIO()
        header_output = csv.StringIO()
        writer = csv.DictWriter(output, fieldnames=WF_FIELDNAMES,
                                dialect="wordfast")
        unit_count = 0
        for unit in self.units:
            if unit.istranslated():
                unit_count += 1
                writer.writerow(unit.dict)
        if unit_count == 0:
            return ""
        self.header.tucount = unit_count
        outheader = csv.DictWriter(header_output,
                                   fieldnames=WF_FIELDNAMES_HEADER,
                                   dialect="wordfast")
        outheader.writerow(self.header.header)
        # getvalue() returns everything written so far, so there is no need
        # to rewind with reset(), which not every StringIO implementation
        # provides.
        decoded = (header_output.getvalue() + output.getvalue()).decode('utf-8')
        try:
            return decoded.encode(self._encoding)
        except UnicodeEncodeError:
            return decoded.encode('utf-16')
441