Package translate :: Package tools :: Module pogrep
[hide private]
[frames | no frames]

Source Code for Module translate.tools.pogrep

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2002-2011 Zuza Software Foundation 
  5  # 
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  # 
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21   
 22  """Grep XLIFF, Gettext PO and TMX localization files 
 23   
 24  Matches are output to snippet files of the same type which can then be reviewed 
 25  and later merged using pomerge 
 26   
 27  See: http://translate.sourceforge.net/wiki/toolkit/pogrep for examples and 
 28  usage instructions 
 29  """ 
 30   
 31  import re 
 32  import locale 
 33   
 34  from translate.storage import factory 
 35  from translate.storage.poheader import poheader 
 36  from translate.misc import optrecurse 
 37  from translate.misc.multistring import multistring 
 38  from translate.lang import data 
 39   
 40   
class GrepMatch(object):
    """Just a small data structure that represents a search match.

    ``unit`` is the unit the match was found in and ``part`` names the part
    of the unit that matched ('target', 'source', 'notes' or 'locations').
    ``part_n`` is the index of the matching string inside that part (it is
    ignored for a non-plural source or target), and ``start``/``end``
    delimit the match inside that string.
    """

    # INITIALIZERS #
    def __init__(self, unit, part='target', part_n=0, start=0, end=0):
        self.unit = unit
        self.part = part
        self.part_n = part_n
        self.start = start
        self.end = end

    # ACCESSORS #
    def get_getter(self):
        """Return a zero-argument callable that fetches the matched string.

        Whether the unit is plural is decided when this method is called,
        not when the returned callable is invoked.  ``None`` is returned
        when ``self.part`` is not one of the four known part names.
        """
        if self.part == 'target':
            if self.unit.hasplural():
                return lambda: self.unit.target.strings[self.part_n]
            return lambda: self.unit.target
        if self.part == 'source':
            if self.unit.hasplural():
                return lambda: self.unit.source.strings[self.part_n]
            return lambda: self.unit.source
        if self.part == 'notes':
            return lambda: self.unit.getnotes()[self.part_n]
        if self.part == 'locations':
            return lambda: self.unit.getlocations()[self.part_n]
        return None

    def get_setter(self):
        """Return a one-argument callable that replaces the matched string.

        Only the 'target' part can be written to; ``None`` is returned for
        every other part.
        """
        if self.part != 'target':
            return None

        if self.unit.hasplural():

            def setter(value):
                # Replace one plural form, then assign the whole list back
                # to the unit's target.
                strings = self.unit.target.strings
                strings[self.part_n] = value
                self.unit.target = strings
        else:

            def setter(value):
                self.unit.target = value
        return setter

    # SPECIAL METHODS #
    def __str__(self):
        """Describe the match, quoting it with up to two characters of context."""
        context = 2
        text = self.get_getter()()
        # Clamp the window to the string so that matches touching either end
        # are shown in full instead of losing their first or last character.
        window_start = max(self.start - context, 0)
        window_end = min(self.end + context, len(text))
        matchpart = text[window_start:window_end]
        return '<GrepMatch "%s" part=%s[%d] start=%d end=%d>' % (matchpart, self.part, self.part_n, self.start, self.end)

    def __repr__(self):
        return str(self)
103 104
def real_index(string, nfc_index):
    """Calculate the real index in the unnormalized string that corresponds to
    the index nfc_index in the normalized string.

    Prefixes of ``string`` of growing length are normalized with
    ``data.normalize`` until the normalized prefix is longer than
    ``nfc_index``; the last prefix length that was not too long is returned.
    The result never exceeds ``len(string)``.
    """
    length = nfc_index
    max_length = len(string)
    while len(data.normalize(string[:length])) <= nfc_index:
        # ">=" rather than "==": if nfc_index is larger than len(string)
        # (possible when normalization makes the text longer), length starts
        # beyond max_length and an equality test would never end the loop.
        if length >= max_length:
            return max_length
        length += 1
    return length - 1
115 116
def find_matches(unit, part, strings, re_search):
    """Return a GrepMatch for every place where re_search matches in strings.

    Each non-empty entry of ``strings`` is normalized before searching, and
    the match boundaries are mapped back onto the unnormalized entry with
    real_index().  Empty entries are skipped but still count towards the
    ``part_n`` numbering.
    """
    return [
        GrepMatch(unit, part=part, part_n=position,
                  start=real_index(text, hit.start()),
                  end=real_index(text, hit.end()))
        for position, text in enumerate(strings)
        if text
        for hit in re_search.finditer(data.normalize(text))
    ]
129 130
class GrepFilter(object):
    """Selects units whose source, target, notes or locations contain a
    search string (or match a regular expression)."""

    def __init__(self, searchstring, searchparts, ignorecase=False, useregexp=False,
                 invertmatch=False, keeptranslations=False, accelchar=None, encoding='utf-8',
                 max_matches=0):
        """Build a filter for searchstring.

        :param searchstring: text to look for; anything that is not already
            text is decoded using ``encoding``.
        :param searchparts: names of the unit parts to search.  When empty,
            the source and the target are searched.
        :param ignorecase: match without regard to case.
        :param useregexp: treat searchstring as a regular expression.
        :param invertmatch: invert the result of :meth:`matches`.
        :param keeptranslations: make :meth:`filterunit` keep every unit that
            has a target.
        :param accelchar: accelerator marker stripped from the tested strings
            before matching.
        :param encoding: encoding used to decode a non-text searchstring.
        :param max_matches: if non-zero, :meth:`getmatches` raises once more
            than this many matches were collected.
        """
        # type(u'') is the text type on both Python 2 (unicode) and 3 (str).
        if not isinstance(searchstring, type(u'')):
            searchstring = searchstring.decode(encoding)
        self.searchstring = data.normalize(searchstring)
        if searchparts:
            # For now we still support the old terminology, except for the old 'source'
            # which has a new meaning now.
            self.search_source = ('source' in searchparts) or ('msgid' in searchparts)
            self.search_target = ('target' in searchparts) or ('msgstr' in searchparts)
            self.search_notes = ('notes' in searchparts) or ('comment' in searchparts)
            self.search_locations = 'locations' in searchparts
        else:
            self.search_source = True
            self.search_target = True
            self.search_notes = False
            self.search_locations = False
        self.ignorecase = ignorecase
        self.useregexp = useregexp
        if self.useregexp:
            # A regular expression must not be lowercased: that would turn
            # escapes such as \S, \W or \D into their opposites.  Case is
            # ignored through the compile flags instead.
            flags = 0
            if self.ignorecase:
                flags = re.IGNORECASE | re.UNICODE
            self.searchpattern = re.compile(self.searchstring, flags)
        elif self.ignorecase:
            self.searchstring = self.searchstring.lower()
        self.invertmatch = invertmatch
        self.keeptranslations = keeptranslations
        self.accelchar = accelchar
        self.max_matches = max_matches

    @staticmethod
    def _strings_of(text):
        """Return the plural forms of a multistring, or [text] otherwise."""
        if isinstance(text, multistring):
            return text.strings
        return [text]

    def matches(self, teststr):
        """Return whether teststr satisfies the search (inverted if requested).

        ``None`` never matches, regardless of ``invertmatch``.
        """
        if teststr is None:
            return False
        teststr = data.normalize(teststr)
        if self.ignorecase:
            teststr = teststr.lower()
        if self.accelchar:
            # Plain string replacement: the accelerator is a literal marker,
            # not a regular expression.  A doubled marker becomes "#".
            teststr = teststr.replace(self.accelchar + self.accelchar, "#")
            teststr = teststr.replace(self.accelchar, "")
        if self.useregexp:
            found = self.searchpattern.search(teststr) is not None
        else:
            found = self.searchstring in teststr
        if self.invertmatch:
            found = not found
        return found

    def filterunit(self, unit):
        """Return True if unit should be kept in the filtered output.

        Headers are always kept, and so are translated units when
        ``keeptranslations`` is set.  Otherwise the unit is kept as soon as
        one of the enabled parts matches.
        """
        if unit.isheader():
            return True

        if self.keeptranslations and unit.target:
            return True

        if self.search_source:
            if any(self.matches(string) for string in self._strings_of(unit.source)):
                return True

        if self.search_target:
            if any(self.matches(string) for string in self._strings_of(unit.target)):
                return True

        if self.search_notes:
            if self.matches(unit.getnotes()):
                return True
        if self.search_locations:
            if self.matches(u" ".join(unit.getlocations())):
                return True
        return False

    def filterfile(self, thefile):
        """Return a new store of the same type holding the units of thefile
        that pass :meth:`filterunit`."""
        thenewfile = type(thefile)()
        thenewfile.setsourcelanguage(thefile.sourcelanguage)
        thenewfile.settargetlanguage(thefile.targetlanguage)
        for unit in thefile.units:
            if self.filterunit(unit):
                thenewfile.addunit(unit)

        if isinstance(thenewfile, poheader):
            # Carry the header values of the original file over.
            thenewfile.updateheader(add=True, **thefile.parseheader())
        return thenewfile

    def getmatches(self, units):
        """Return ``(matches, indexes)`` for the given units.

        ``matches`` is a list of GrepMatch objects and ``indexes`` lists the
        positions in ``units`` of the units that produced at least one match.
        An empty search string yields two empty lists.

        :raises Exception: if ``max_matches`` is non-zero and more matches
            than that have been collected.
        """
        if not self.searchstring:
            return [], []

        searchstring = self.searchstring
        # re.LOCALE is left out on purpose: the pattern is text, and Python 3
        # refuses to compile a text pattern with that flag.
        flags = re.MULTILINE | re.UNICODE

        if self.ignorecase:
            flags |= re.IGNORECASE
        if not self.useregexp:
            searchstring = re.escape(searchstring)
        self.re_search = re.compile(u'(%s)' % (searchstring), flags)

        matches = []
        indexes = []

        for index, unit in enumerate(units):
            old_length = len(matches)

            if self.search_target:
                if unit.hasplural():
                    targets = unit.target.strings
                else:
                    targets = [unit.target]
                matches.extend(find_matches(unit, 'target', targets, self.re_search))
            if self.search_source:
                if unit.hasplural():
                    sources = unit.source.strings
                else:
                    sources = [unit.source]
                matches.extend(find_matches(unit, 'source', sources, self.re_search))
            if self.search_notes:
                # NOTE(review): filterunit() treats getnotes() as one string,
                # while here it is passed as a sequence of strings - confirm
                # which of the two the storage classes actually return.
                matches.extend(find_matches(unit, 'notes', unit.getnotes(), self.re_search))

            if self.search_locations:
                matches.extend(find_matches(unit, 'locations', unit.getlocations(), self.re_search))

            # A search for a single letter or an all-inclusive regular
            # expression could give enough results to cause performance
            # problems. The answer is probably not very useful at this scale.
            if self.max_matches and len(matches) > self.max_matches:
                raise Exception("Too many matches found")

            if len(matches) > old_length:
                indexes.append(index)

        return matches, indexes
277 278
class GrepOptionParser(optrecurse.RecursiveOptionParser):
    """Option parser for the grep tool: the first free-standing argument is
    the search string."""

    def parse_args(self, args=None, values=None):
        """Parse the command line, handling the search string and implicit
        input/output arguments."""
        plain_parser = optrecurse.optparse.OptionParser
        options, leftover = plain_parser.parse_args(self, args, values)

        # The first free-standing argument is always the search string.
        if not leftover:
            self.error("At least one argument must be given for the search string")
        else:
            options.searchstring = leftover[0]
            leftover = leftover[1:]

        # Whatever is left fills in input and/or output when those were not
        # given as options.
        if leftover and not options.input:
            if options.output:
                options.input = leftover
                leftover = []
            else:
                options.input = leftover[:-1]
                leftover = leftover[-1:]
        if leftover and not options.output:
            options.output = leftover[-1]
            leftover = leftover[:-1]
        if leftover:
            self.error("You have used an invalid combination of --input, --output and freestanding args")

        # A one-element input list is unwrapped to the element itself.
        if isinstance(options.input, list) and len(options.input) == 1:
            options.input = options.input[0]
        return (options, leftover)

    def set_usage(self, usage=None):
        """Set the usage string; without an explicit usage, build one from
        getusagestring() of every option."""
        if usage is not None:
            super(GrepOptionParser, self).set_usage(usage)
            return
        option_usages = [self.getusagestring(option) for option in self.option_list]
        self.usage = "%prog searchstring " + " ".join(option_usages)

    def run(self):
        """Parse the arguments and run recursiveprocess with the resulting
        options."""
        options, args = self.parse_args()
        options.inputformats = self.inputformats
        options.outputoptions = self.outputoptions
        options.checkfilter = GrepFilter(
            options.searchstring,
            options.searchparts,
            ignorecase=options.ignorecase,
            useregexp=options.useregexp,
            invertmatch=options.invertmatch,
            keeptranslations=options.keeptranslations,
            accelchar=options.accelchar,
            encoding=locale.getpreferredencoding(),
        )
        self.usepsyco(options)
        self.recursiveprocess(options)
322 323
def rungrep(inputfile, outputfile, templatefile, checkfilter):
    """Filter inputfile with checkfilter and write the result to outputfile.

    Returns False, writing nothing, when the filtered store is empty and
    True otherwise.  templatefile is not used.
    """
    filtered = checkfilter.filterfile(factory.getobject(inputfile))
    if not filtered.isempty():
        outputfile.write(str(filtered))
        return True
    return False
332 333
def cmdlineparser():
    """Build and return the GrepOptionParser for the command line tool."""
    # Every supported extension is written back out under the same extension.
    extensions = ("po", "pot", "mo", "gmo", "tmx", "xliff", "xlf", "xlff")
    formats = dict((extension, (extension, rungrep)) for extension in extensions)
    formats[None] = ("po", rungrep)
    parser = GrepOptionParser(formats)

    def add_flag(short_name, long_name, dest, help_text):
        # All on/off switches share the same action and default.
        parser.add_option(short_name, long_name, dest=dest,
                          action="store_true", default=False, help=help_text)

    parser.add_option(
        "", "--search", dest="searchparts",
        action="append", type="choice",
        choices=["source", "target", "notes", "locations", "msgid", "msgstr", "comment"],
        metavar="SEARCHPARTS",
        help="searches the given parts (source, target, notes and locations)")
    add_flag("-I", "--ignore-case", "ignorecase", "ignore case distinctions")
    add_flag("-e", "--regexp", "useregexp", "use regular expression matching")
    add_flag("-v", "--invert-match", "invertmatch", "select non-matching lines")
    parser.add_option(
        "", "--accelerator", dest="accelchar",
        action="store", type="choice", choices=["&", "_", "~"],
        metavar="ACCELERATOR",
        help="ignores the given accelerator when matching")
    add_flag("-k", "--keep-translations", "keeptranslations",
             "always extract units with translations")

    parser.set_usage()
    parser.passthrough.append('checkfilter')
    parser.description = __doc__
    return parser
359 360
def main():
    """Command line entry point: build the option parser and run it."""
    cmdlineparser().run()


if __name__ == '__main__':
    main()