Package logilab :: Package common :: Module textutils
[frames] | no frames]

Source Code for Module logilab.common.textutils

  1  """Some text manipulation utility functions. 
  2   
  3  :author:    Logilab 
  4  :copyright: 2003-2008 LOGILAB S.A. (Paris, FRANCE), all rights reserved. 
  5  :contact: http://www.logilab.fr/ -- mailto:contact@logilab.fr 
  6  :license: General Public License version 2 - http://www.gnu.org/licenses 
  7   
  8  :group text formatting: normalize_text, normalize_paragraph, pretty_match,\ 
  9  unquote, colorize_ansi 
 10  :group text manipulation: searchall, splitstrip 
 11  :sort: text formatting, text manipulation 
 12   
 13  :type ANSI_STYLES: dict(str) 
 14  :var ANSI_STYLES: dictionary mapping style identifier to ANSI terminal code 
 15   
 16  :type ANSI_COLORS: dict(str) 
 17  :var ANSI_COLORS: dictionary mapping color identifier to ANSI terminal code 
 18   
 19  :type ANSI_PREFIX: str 
 20  :var ANSI_PREFIX: 
 21    ANSI terminal code notifying the start of an ANSI escape sequence 
 22   
 23  :type ANSI_END: str 
 24  :var ANSI_END: 
 25    ANSI terminal code notifying the end of an ANSI escape sequence 
 26   
 27  :type ANSI_RESET: str 
 28  :var ANSI_RESET: 
 29    ANSI terminal code resetting format defined by a previous ANSI escape sequence 
 30  """ 
 31  __docformat__ = "restructuredtext en" 
 32   
 33  import sys 
 34  import re 
 35  from unicodedata import normalize as _uninormalize 
 36  try: 
 37      from os import linesep 
 38  except ImportError: 
 39      linesep = '\n' # gae 
 40   
 41  from logilab.common.deprecation import deprecated 
 42   
 43  MANUAL_UNICODE_MAP = { 
 44      u'\xa1': u'!',    # INVERTED EXCLAMATION MARK 
 45      u'\u0142': u'l',  # LATIN SMALL LETTER L WITH STROKE 
 46      u'\u2044': u'/',  # FRACTION SLASH 
 47      u'\xc6': u'AE',   # LATIN CAPITAL LETTER AE 
 48      u'\xa9': u'(c)',  # COPYRIGHT SIGN 
 49      u'\xab': u'"',    # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK 
 50      u'\xe6': u'ae',   # LATIN SMALL LETTER AE 
 51      u'\xae': u'(r)',  # REGISTERED SIGN 
 52      u'\u0153': u'oe', # LATIN SMALL LIGATURE OE 
 53      u'\u0152': u'OE', # LATIN CAPITAL LIGATURE OE 
 54      u'\xd8': u'O',    # LATIN CAPITAL LETTER O WITH STROKE 
 55      u'\xf8': u'o',    # LATIN SMALL LETTER O WITH STROKE 
 56      u'\xbb': u'"',    # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK 
 57      u'\xdf': u'ss',   # LATIN SMALL LETTER SHARP S 
 58      } 
 59   
60 -def unormalize(ustring, ignorenonascii=False):
61 """replace diacritical characters with their corresponding ascii characters 62 """ 63 res = [] 64 for letter in ustring[:]: 65 try: 66 replacement = MANUAL_UNICODE_MAP[letter] 67 except KeyError: 68 if ord(letter) >= 2**8: 69 if ignorenonascii: 70 continue 71 raise ValueError("can't deal with non-ascii based characters") 72 replacement = _uninormalize('NFD', letter)[0] 73 res.append(replacement) 74 return u''.join(res)
75
def unquote(string):
    """remove optional quotes (simple or double) from the string

    The leading and the trailing quote are removed independently from each
    other: they don't have to be present both, nor to be of the same kind.

    :type string: str or unicode
    :param string: an optionally quoted string

    :rtype: str or unicode
    :return: the unquoted string (or the input string if it wasn't quoted)
    """
    if not string:
        return string
    if string[0] in '"\'':
        string = string[1:]
    # the string is empty at this point when the input was a lone quote
    # character: guard the index access to avoid an IndexError
    if string and string[-1] in '"\'':
        string = string[:-1]
    return string
# Paragraph separator: two consecutive line breaks, each one being LF or CRLF.
_BLANKLINES_RGX = re.compile(r'\r?\n\r?\n')
# Any run of whitespace. Raw string: '\s' is not a valid string escape.
_NORM_SPACES_RGX = re.compile(r'\s+')
def normalize_text(text, line_len=80, indent='', rest=False):
    """wrap a text made of several paragraphs to a maximum line size, with
    an optional prefix on every line.

    Paragraphs are separated by blank lines in the input. Each of them is
    wrapped on its own and they are joined back with a line holding only the
    indentation string, so blank lines are kept. The indentation string may
    be used to insert a comment (#) or a quoting (>) mark for instance.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :type rest: bool
    :param rest:
      if true paragraphs are wrapped using `normalize_rest_paragraph`, else
      using `normalize_paragraph` (the default)

    :rtype: str or unicode
    :return: the wrapped paragraphs, each line being prefixed by `indent`
    """
    wrap = normalize_rest_paragraph if rest else normalize_paragraph
    separator = '%s%s%s' % (linesep, indent, linesep)
    paragraphs = [wrap(paragraph, line_len, indent)
                  for paragraph in _BLANKLINES_RGX.split(text)]
    return separator.join(paragraphs)
126 127
def normalize_paragraph(text, line_len=80, indent=''):
    """wrap a single paragraph to a maximum line size, with an optional
    prefix on every line.

    Every run of whitespace (line jumps included) is first collapsed to a
    single blank, then the text is cut line by line with `splittext`. The
    indentation string may be used to insert a comment mark for instance.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :rtype: str or unicode
    :return: the wrapped lines, prefixed by `indent` and joined with `linesep`
    """
    remaining = _NORM_SPACES_RGX.sub(' ', text)
    # the indentation counts in the line length
    width = line_len - len(indent)
    wrapped = []
    while remaining:
        head, remaining = splittext(remaining.strip(), width)
        wrapped.append(indent + head)
    return linesep.join(wrapped)
156
def normalize_rest_paragraph(text, line_len=80, indent=''):
    """wrap a ReST paragraph to a maximum line size, with an optional prefix
    on every line.

    Unlike `normalize_paragraph`, input lines are handled one by one: a line
    which is too long is split into several output lines, but consecutive
    input lines are never merged. Lines which are empty once stripped are
    dropped. The indentation string may be used to insert a comment mark
    for instance.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :rtype: str or unicode
    :return: the wrapped lines, prefixed by `indent` and joined with `linesep`
    """
    # the indentation counts in the line length
    width = line_len - len(indent)
    wrapped = []
    for raw_line in text.splitlines():
        pending = _NORM_SPACES_RGX.sub(' ', raw_line.strip())
        while len(pending) > width:
            # too long line, need split
            head, leftover = splittext(pending, width)
            wrapped.append(indent + head)
            # the carried over text gets a trailing blank, which is taken
            # into account by the length test above
            pending = leftover + ' ' if leftover else ''
        if pending:
            wrapped.append(indent + pending.strip())
    return linesep.join(wrapped)
196
def splittext(text, line_len):
    """split the given text on space according to the given max line size

    The text is cut on the last space found at or before `line_len`. When
    there is no such space, the line overflows up to the next space (or up
    to the end of the text).

    :type text: str or unicode
    :param text: the text to split

    :type line_len: int
    :param line_len:
      maximum size of the first line; a negative value is handled as 0

    :rtype: tuple
    :return: a 2-uple:

      * a line <= line_len if possible
      * the rest of the text which has to be reported on another line
    """
    # A negative size (indentation wider than the requested line) would be
    # used as an index from the end of the text: wrong slices, and for -1 a
    # remainder equal to the whole text, hence an endless loop in callers.
    line_len = max(line_len, 0)
    if len(text) <= line_len:
        return text, ''
    # last space at or before the limit; a space at index 0 is not usable
    # since it would produce an empty line
    pos = text.rfind(' ', 1, line_len + 1)
    if pos == -1:
        # no break point fits: overflow up to the next space, if any
        pos = text.find(' ', line_len)
        if pos == -1:
            pos = len(text)
    return text[:pos], text[pos + 1:].strip()
214 215
def splitstrip(string, sep=','):
    """return a list of stripped string by splitting the string given as
    argument on `sep` (',' by default). Empty string are discarded.

    >>> splitstrip('a, b, c   ,  4,,')
    ['a', 'b', 'c', '4']
    >>> splitstrip('a')
    ['a']
    >>>

    :type string: str or unicode
    :param string: a csv line

    :type sep: str or unicode
    :param sep: field separator, default to the comma (',')

    :rtype: list
    :return: the non empty fields of the line, stripped
    """
    # strip each field only once, then filter out the empty ones
    stripped = [word.strip() for word in string.split(sep)]
    return [word for word in stripped if word]
# deprecated alias of `splitstrip`
get_csv = deprecated()(splitstrip)

# blanks and commas are removed from a string before parsing its values
_BLANK_URE = r'(\s|,)+'
_BLANK_RE = re.compile(_BLANK_URE)
__VALUE_URE = r'-?(([0-9]+\.[0-9]*)|((0x?)?[0-9]+))'
__UNITS_URE = r'[a-zA-Z]+'
# a literal value optionally followed by its unit
_VALUE_RE = re.compile(r'(?P<value>%s)(?P<unit>%s)?'%(__VALUE_URE,__UNITS_URE))

# size units, expressed in bytes
BYTE_UNITS = {
    "B": 1,
    "KB": 1024,
    "MB": 1024 ** 2,
    "GB": 1024 ** 3,
    "TB": 1024 ** 4,
}

# duration units, expressed in seconds
TIME_UNITS = {
    "ms": 0.001,  # one millisecond is 1/1000 second
    "s": 1,
    "min": 60,
    "h": 60 * 60,
    "d": 60 * 60 * 24,
}
def apply_units(string, units, inter=None, final=float, blank_reg=_BLANK_RE,
                value_reg=_VALUE_RE):
    """Parse the string applying the units defined in units
    (e.g.: "1.5min", {'min': 60} -> 90.0).

    Every value found in the string is converted using `inter`, multiplied
    by the factor of its unit (if it has one), and the values are summed.

    :type string: str or unicode
    :param string: the string to parse

    :type units: dict (or any object with __getitem__ using basestring key)
    :param units: a dict mapping a unit string repr to its value

    :type inter: type
    :param inter:
      used to parse every intermediate value (which must support the
      multiplication by a unit value and the sum), default to `final`

    :type final: type
    :param final: used to convert the sum of the values, default to float

    :type blank_reg: regexp
    :param blank_reg: should match every blank char to ignore.

    :type value_reg: regexp with "value" and optional "unit" group
    :param value_reg: match a value and its unit

    :raise KeyError: if a unit found in the string is not a key of `units`

    :return: the sum of the parsed values, converted with `final`
    """
    if inter is None:
        inter = final
    # use the regexp given by the caller, not the module level default
    string = blank_reg.sub('', string)
    values = []
    for match in value_reg.finditer(string):
        dic = match.groupdict()
        lit, unit = dic["value"], dic.get("unit")
        value = inter(lit)
        if unit is not None:
            value *= units[unit]
        values.append(value)
    return final(sum(values))
# any kind of line break (a run of consecutive CR counts as a single one)
_LINE_RGX = re.compile('\r\n|\r+|\n')


def pretty_match(match, string, underline_char='^'):
    """return a string with the match location underlined:

    >>> import re
    >>> print(pretty_match(re.search('mange', 'il mange du bacon'), 'il mange du bacon'))
    il mange du bacon
       ^^^^^
    >>>

    Line breaks of the string are replaced by `linesep` in the result.

    :type match: _sre.SRE_match
    :param match: object returned by re.match, re.search or re.finditer

    :type string: str or unicode
    :param string:
      the string on which the regular expression has been applied to
      obtain the `match` object

    :type underline_char: str or unicode
    :param underline_char:
      character to use to underline the matched section, default to the
      caret '^'

    :rtype: str or unicode
    :return:
      the original string with an inserted line to underline the match
      location
    """
    # The offsets of the match are relative to the original string, while
    # everything below works on the string with normalized line breaks.
    # Normalization may change the length of the text (e.g. '\r\n' replaced
    # by '\n'), so the offsets are translated by normalizing the text which
    # precedes them.
    start = len(_LINE_RGX.sub(linesep, string[:match.start()]))
    end = len(_LINE_RGX.sub(linesep, string[:match.end()]))
    string = _LINE_RGX.sub(linesep, string)
    line_start = string.rfind(linesep, 0, start)
    if line_start == -1:
        # the match starts on the first line
        line_start = 0
        result = []
    else:
        result = [string[:line_start]]
        line_start += len(linesep)
    underline = ' ' * (start - line_start) + underline_char * (end - start)
    line_end = string.find(linesep, end)
    if line_end == -1:
        # the match ends on the last line
        result.append(string[line_start:])
        result.append(underline)
    else:
        result.append(string[line_start:line_end])
        result.append(underline)
        result.append(string[line_end + len(linesep):])
    return linesep.join(result).rstrip()
# Ansi colorization ###########################################################

# An escape sequence is made of the prefix, ';' separated codes and the end
# marker.
ANSI_PREFIX = '\033['
ANSI_END = 'm'
# Complete sequence cancelling the formatting set by a previous sequence.
ANSI_RESET = '\033[0m'

# style identifier -> ANSI code
ANSI_STYLES = {
    'reset': "0",
    'bold': "1",
    'italic': "3",
    'underline': "4",
    'blink': "5",
    'inverse': "7",
    'strike': "9",
}

# color identifier -> ANSI code
ANSI_COLORS = {
    'reset': "0",
    'black': "30",
    'red': "31",
    'green': "32",
    'yellow': "33",
    'blue': "34",
    'magenta': "35",
    'cyan': "36",
    'white': "37",
}
def _get_ansi_code(color=None, style=None):
    """build the ansi escape sequence selecting a color and/or some styles

    :type color: str or None
    :param color:
      the color identifier (see `ANSI_COLORS` for available values)

    :type style: str or None
    :param style:
      style string (see `ANSI_STYLES` for available values). To get
      several style effects at the same time, use a comma as separator.

    :raise KeyError: if an unexistent color or style identifier is given

    :rtype: str
    :return:
      the built escape code, or an empty string when neither a color nor a
      style is given
    """
    codes = []
    if style:
        # style codes come first, in the order they were given
        codes.extend(ANSI_STYLES[effect] for effect in splitstrip(style))
    if color:
        codes.append(ANSI_COLORS[color])
    if not codes:
        return ''
    return ANSI_PREFIX + ';'.join(codes) + ANSI_END
405
def colorize_ansi(msg, color=None, style=None):
    """wrap a message with the ansi escape codes selecting the given color
    and style, followed by the reset code

    :type msg: str or unicode
    :param msg: the message string to colorize

    :type color: str or None
    :param color:
      the color identifier (see `ANSI_COLORS` for available values)

    :type style: str or None
    :param style:
      style string (see `ANSI_STYLES` for available values). To get
      several style effects at the same time, use a comma as separator.

    :raise KeyError: if an unexistent color or style identifier is given

    :rtype: str or unicode
    :return:
      the ansi escaped string, or the message itself when no color nor
      style is requested
    """
    # nothing requested: leave the text as is
    if color is None and style is None:
        return msg
    escape_code = _get_ansi_code(color, style)
    # the code is empty when both color and style are empty values: don't
    # wrap the message in that case either
    if not escape_code:
        return msg
    return '%s%s%s' % (escape_code, msg, ANSI_RESET)
# kind of diff line -> color identifier (see `ANSI_COLORS`)
DIFF_STYLE = {'separator': 'cyan', 'remove': 'red', 'add': 'green'}


def diff_colorize_ansi(lines, out=sys.stdout, style=DIFF_STYLE):
    """write the lines of a diff to a stream, colorized according to their
    kind

    File header lines ('--- ' and '+++ ') use the 'separator' color, removed
    lines ('-') the 'remove' color and added lines ('+') the 'add' color.
    Other lines are written unchanged.

    :type lines: iterable of str or unicode
    :param lines:
      the lines of the diff; nothing is added after a line when writing it

    :type out: file like object
    :param out: the stream to write to, default to sys.stdout

    :type style: dict
    :param style:
      mapping of 'separator', 'remove' and 'add' to a color identifier (see
      `ANSI_COLORS` for available values), default to `DIFF_STYLE`
    """
    for line in lines:
        # file headers have to be checked first since they also start with
        # '-' or '+'
        if line[:4] in ('--- ', '+++ '):
            out.write(colorize_ansi(line, style['separator']))
        # slices instead of line[0]: no IndexError on an empty line
        elif line[:1] == '-':
            out.write(colorize_ansi(line, style['remove']))
        elif line[:1] == '+':
            out.write(colorize_ansi(line, style['add']))
        else:
            out.write(line)
451