Package logilab :: Package common :: Module textutils
[frames] | no frames]

Source Code for Module logilab.common.textutils

  1  # copyright 2003-2011 LOGILAB S.A. (Paris, FRANCE), all rights reserved. 
  2  # contact http://www.logilab.fr/ -- mailto:contact@logilab.fr 
  3  # 
  4  # This file is part of logilab-common. 
  5  # 
  6  # logilab-common is free software: you can redistribute it and/or modify it under 
  7  # the terms of the GNU Lesser General Public License as published by the Free 
  8  # Software Foundation, either version 2.1 of the License, or (at your option) any 
  9  # later version. 
 10  # 
 11  # logilab-common is distributed in the hope that it will be useful, but WITHOUT 
 12  # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 
 13  # FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more 
 14  # details. 
 15  # 
 16  # You should have received a copy of the GNU Lesser General Public License along 
 17  # with logilab-common.  If not, see <http://www.gnu.org/licenses/>. 
 18  """Some text manipulation utility functions. 
 19   
 20   
 21  :group text formatting: normalize_text, normalize_paragraph, pretty_match,\ 
 22  unquote, colorize_ansi 
 23  :group text manipulation: searchall, splitstrip 
 24  :sort: text formatting, text manipulation 
 25   
 26  :type ANSI_STYLES: dict(str) 
 27  :var ANSI_STYLES: dictionary mapping style identifier to ANSI terminal code 
 28   
 29  :type ANSI_COLORS: dict(str) 
 30  :var ANSI_COLORS: dictionary mapping color identifier to ANSI terminal code 
 31   
 32  :type ANSI_PREFIX: str 
 33  :var ANSI_PREFIX: 
 34    ANSI terminal code notifying the start of an ANSI escape sequence 
 35   
 36  :type ANSI_END: str 
 37  :var ANSI_END: 
 38    ANSI terminal code notifying the end of an ANSI escape sequence 
 39   
 40  :type ANSI_RESET: str 
 41  :var ANSI_RESET: 
 42    ANSI terminal code resetting format defined by a previous ANSI escape sequence 
 43  """ 
 44  __docformat__ = "restructuredtext en" 
 45   
 46  import sys 
 47  import re 
 48  import os.path as osp 
 49  from warnings import warn 
 50  from unicodedata import normalize as _uninormalize 
 51  try: 
 52      from os import linesep 
 53  except ImportError: 
 54      linesep = '\n' # gae 
 55   
 56  from logilab.common.deprecation import deprecated 
 57   
 58  MANUAL_UNICODE_MAP = { 
 59      u'\xa1': u'!',    # INVERTED EXCLAMATION MARK 
 60      u'\u0142': u'l',  # LATIN SMALL LETTER L WITH STROKE 
 61      u'\u2044': u'/',  # FRACTION SLASH 
 62      u'\xc6': u'AE',   # LATIN CAPITAL LETTER AE 
 63      u'\xa9': u'(c)',  # COPYRIGHT SIGN 
 64      u'\xab': u'"',    # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK 
 65      u'\xe6': u'ae',   # LATIN SMALL LETTER AE 
 66      u'\xae': u'(r)',  # REGISTERED SIGN 
 67      u'\u0153': u'oe', # LATIN SMALL LIGATURE OE 
 68      u'\u0152': u'OE', # LATIN CAPITAL LIGATURE OE 
 69      u'\xd8': u'O',    # LATIN CAPITAL LETTER O WITH STROKE 
 70      u'\xf8': u'o',    # LATIN SMALL LETTER O WITH STROKE 
 71      u'\xbb': u'"',    # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK 
 72      u'\xdf': u'ss',   # LATIN SMALL LETTER SHARP S 
 73      } 
 74   
75 -def unormalize(ustring, ignorenonascii=None, substitute=None):
76 """replace diacritical characters with their corresponding ascii characters 77 78 Convert the unicode string to its long normalized form (unicode character 79 will be transform into several characters) and keep the first one only. 80 The normal form KD (NFKD) will apply the compatibility decomposition, i.e. 81 replace all compatibility characters with their equivalents. 82 83 :type substitute: str 84 :param substitute: replacement character to use if decomposition fails 85 86 :see: Another project about ASCII transliterations of Unicode text 87 http://pypi.python.org/pypi/Unidecode 88 """ 89 # backward compatibility, ignorenonascii was a boolean 90 if ignorenonascii is not None: 91 warn("ignorenonascii is deprecated, use substitute named parameter instead", 92 DeprecationWarning, stacklevel=2) 93 if ignorenonascii: 94 substitute = '' 95 res = [] 96 for letter in ustring[:]: 97 try: 98 replacement = MANUAL_UNICODE_MAP[letter] 99 except KeyError: 100 replacement = _uninormalize('NFKD', letter)[0] 101 if ord(replacement) >= 2 ** 7: 102 if substitute is None: 103 raise ValueError("can't deal with non-ascii based characters") 104 replacement = substitute 105 res.append(replacement) 106 return u''.join(res)
107
def unquote(string):
    """remove optional quotes (simple or double) from the string

    A leading quote and a trailing quote are removed independently of each
    other: they need not match, and either one may be missing.

    :type string: str or unicode
    :param string: an optionally quoted string

    :rtype: str or unicode
    :return: the unquoted string (or the input string if it wasn't quoted)
    """
    if not string:
        return string
    if string[0] in '"\'':
        string = string[1:]
    # the string is empty at this point when the input was a lone quote
    # character: guard the lookup of the last character
    if string and string[-1] in '"\'':
        string = string[:-1]
    return string
# two consecutive line breaks (LF or CRLF), i.e. the separator between
# two paragraphs
_BLANKLINES_RGX = re.compile(r'\r?\n\r?\n')
# any run of whitespace characters; raw string so that `\s` reaches the
# regexp engine instead of being an invalid string escape sequence
_NORM_SPACES_RGX = re.compile(r'\s+')
def normalize_text(text, line_len=80, indent='', rest=False):
    """re-wrap a text so that it fits a maximum line size, optionally
    prefixing every line with an arbitrary indentation string (a comment
    mark `#` or a quoting mark `>` for instance). The text is processed
    paragraph by paragraph: paragraphs are the chunks separated by a blank
    line, and the blank lines between them are kept.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :type rest: bool
    :param rest:
      when true each paragraph is formatted with
      `normalize_rest_paragraph` instead of `normalize_paragraph`

    :rtype: str or unicode
    :return:
      the formatted paragraphs, joined by an empty line which is itself
      prefixed by the indentation string
    """
    format_paragraph = normalize_rest_paragraph if rest else normalize_paragraph
    # what separates two paragraphs in the output: an "empty" indented line
    separator = '%s%s%s' % (linesep, indent, linesep)
    paragraphs = [format_paragraph(chunk, line_len, indent)
                  for chunk in _BLANKLINES_RGX.split(text)]
    return separator.join(paragraphs)
158 159
def normalize_paragraph(text, line_len=80, indent=''):
    """re-wrap a single paragraph so that it fits a maximum line size,
    optionally prefixing every line with an arbitrary indentation string
    (a comment mark for instance). Every run of whitespace, line jumps
    included, is first replaced by a single space.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :rtype: str or unicode
    :return:
      the wrapped lines, each prefixed by the indentation string and
      joined with the platform line separator
    """
    remaining = _NORM_SPACES_RGX.sub(' ', text)
    # the indentation string counts in the line size
    width = line_len - len(indent)
    wrapped = []
    while remaining:
        head, remaining = splittext(remaining.strip(), width)
        wrapped.append(indent + head)
    return linesep.join(wrapped)
188
def normalize_rest_paragraph(text, line_len=80, indent=''):
    """normalize a ReST text to display it with a maximum line size and
    optionally arbitrary indentation. Line jumps are normalized. The
    indentation string may be used to insert a comment mark for
    instance.

    Input lines are handled one by one: each non-empty line produces its
    own output line(s) and is never merged with the following one.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :rtype: str or unicode
    :return:
      the input text normalized to fit on lines with a maximized size
      inferior to `line_len`, and optionally prefixed by an
      indentation string
    """
    toreport = ''
    lines = []
    # the indentation string counts in the line size
    line_len = line_len - len(indent)
    for line in text.splitlines():
        # `toreport` is always empty here: it is reset below each time it
        # has been consumed
        line = toreport + _NORM_SPACES_RGX.sub(' ', line.strip())
        toreport = ''
        while len(line) > line_len:
            # too long line, need split
            line, toreport = splittext(line, line_len)
            lines.append(indent + line)
            if toreport:
                # keep splitting what did not fit on the emitted line
                line = toreport + ' '
                toreport = ''
            else:
                line = ''
        # lines which are empty once stripped produce no output
        if line:
            lines.append(indent + line.strip())
    return linesep.join(lines)
228 229
def splittext(text, line_len):
    """split the given text on space according to the given max line size

    The text is cut on the last space found at or before `line_len`. When
    there is no such space (first word longer than `line_len`), it is cut on
    the first space found after it, or not cut at all.

    :type text: str or unicode
    :param text: the text to split

    :type line_len: int
    :param line_len:
      maximum size of the first line, a negative value is handled as 0

    :rtype: tuple
    :return: a 2-uple:

      * a line <= line_len if possible
      * the rest of the text which has to be reported on another line
    """
    # a negative size would be interpreted as an index relative to the end of
    # the text by the slices below, dropping characters or returning an empty
    # line forever
    line_len = max(line_len, 0)
    if len(text) <= line_len:
        return text, ''
    # from here len(text) > line_len, hence `line_len` is a valid index
    pos = line_len
    # look backward for the last space which makes the line fit
    while pos > 0 and text[pos] != ' ':
        pos -= 1
    if pos == 0:
        # no such space: look forward for the end of the first word
        pos = line_len
        while len(text) > pos and text[pos] != ' ':
            pos += 1
    return text[:pos], text[pos+1:].strip()
247 248
def splitstrip(string, sep=','):
    """return a list of stripped string by splitting the string given as
    argument on `sep` (',' by default). Empty string are discarded.

    >>> splitstrip('a, b, c   ,  4,,')
    ['a', 'b', 'c', '4']
    >>> splitstrip('a')
    ['a']
    >>>

    :type string: str or unicode
    :param string: a csv line

    :type sep: str or unicode
    :param sep: field separator, default to the comma (',')

    :rtype: list
    :return: the non-empty fields of the line, stripped, in their input order
    """
    # strip each field only once, then drop those which are left empty
    stripped = (word.strip() for word in string.split(sep))
    return [word for word in stripped if word]
# Former name of `splitstrip`, still exported: the function is wrapped by
# `deprecated` (imported from logilab.common.deprecation) with the message
# below.
get_csv = deprecated('get_csv is deprecated, use splitstrip')(splitstrip)
def split_url_or_path(url_or_path):
    """split a string containing either an url of the form
    <scheme>://<path> or a local file system path into what precedes its
    latest component and the latest component itself. Trailing separators
    are ignored.

    :type url_or_path: str or unicode
    :param url_or_path: the url or file system path to split

    :rtype: list or tuple
    :return:
      the result of `rsplit('/', 1)` (a list) for an url, the result of
      `os.path.split` (a 2-uple) for a file system path
    """
    if '://' not in url_or_path:
        # local file system path
        trimmed = url_or_path.rstrip(osp.sep)
        return osp.split(trimmed)
    trimmed = url_or_path.rstrip('/')
    return trimmed.rsplit('/', 1)
280 281
def text_to_dict(text):
    """parse multilines text containing simple 'key=value' lines and return a
    dict of {'key': 'value'}. When the same key is encountered multiple time,
    value is turned into a list containing all values.

    Keys and values are stripped. Empty lines and lines starting with '#' are
    ignored. Only the first '=' of a line is a separator.

    >>> text_to_dict('''multiple=1
    ... multiple= 2
    ... single =3
    ... ''')
    {'single': '3', 'multiple': ['1', '2']}

    :type text: str or unicode
    :param text: the text to parse, may be empty or None

    :raise ValueError: if a line to parse doesn't contain any '='

    :rtype: dict
    :return: the parsed values, indexed by key
    """
    res = {}
    if not text:
        return res
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        key, sep, value = line.partition('=')
        if not sep:
            # name the faulty line rather than failing on tuple unpacking
            raise ValueError("invalid line %r: expected 'key=value'" % line)
        key, value = key.strip(), value.strip()
        if key not in res:
            res[key] = value
        elif isinstance(res[key], list):
            res[key].append(value)
        else:
            # second occurrence of the key: switch to a list of values
            res[key] = [res[key], value]
    return res
_BLANK_URE = r'(\s|,)+'
_BLANK_RE = re.compile(_BLANK_URE)
__VALUE_URE = r'-?(([0-9]+\.[0-9]*)|((0x?)?[0-9]+))'
__UNITS_URE = r'[a-zA-Z]+'
_VALUE_RE = re.compile(r'(?P<value>%s)(?P<unit>%s)?'%(__VALUE_URE, __UNITS_URE))

# unit -> number of bytes
BYTE_UNITS = {
    "b": 1,
    "kb": 1024,
    "mb": 1024 ** 2,
    "gb": 1024 ** 3,
    "tb": 1024 ** 4,
}

# unit -> number of seconds
TIME_UNITS = {
    "ms": 0.001,
    "s": 1,
    "min": 60,
    "h": 60 * 60,
    "d": 60 * 60 *24,
}


def apply_units(string, units, inter=None, final=float, blank_reg=_BLANK_RE,
                value_reg=_VALUE_RE):
    """Parse the string applying the units defined in units
    (e.g.: "1.5m", {'m': 60} -> 90.0).

    Everything matched by `blank_reg` is removed from the string, then each
    value found by `value_reg` is converted, multiplied by the value of its
    unit (if any) and the results are summed.

    :type string: str or unicode
    :param string: the string to parse

    :type units: dict (or any object with __getitem__ using basestring key)
    :param units:
      a dict mapping a unit string repr to its value, keys being looked up in
      lower case

    :type inter: type
    :param inter:
      used to parse every intermediate value (which are then multiplied and
      summed), default to `final`

    :type final: type
    :param final: used to convert the sum of the values, default to float

    :type blank_reg: regexp
    :param blank_reg: should match every blank char to ignore.

    :type value_reg: regexp with "value" and optional "unit" group
    :param value_reg: match a value and its unit

    :raise KeyError: if a unit found in the string is not in `units`

    :return: the sum of the parsed values, converted using `final`
    """
    if inter is None:
        inter = final
    # honour the regexp given by the caller, not the module default
    string = blank_reg.sub('', string)
    values = []
    for match in value_reg.finditer(string):
        dic = match.groupdict()
        lit, unit = dic["value"], dic.get("unit")
        value = inter(lit)
        if unit is not None:
            try:
                value *= units[unit.lower()]
            except KeyError:
                raise KeyError('invalid unit %s. valid units are %s' %
                               (unit, list(units.keys())))
        values.append(value)
    return final(sum(values))
# any kind of line break (CRLF, a run of CR, or LF)
_LINE_RGX = re.compile('\r\n|\r+|\n')


def pretty_match(match, string, underline_char='^'):
    """return a string with the match location underlined:

    >>> import re
    >>> print(pretty_match(re.search('mange', 'il mange du bacon'), 'il mange du bacon'))
    il mange du bacon
       ^^^^^
    >>>

    Line breaks of the string are replaced by the platform line separator.

    :type match: _sre.SRE_match
    :param match: object returned by re.match, re.search or re.finditer

    :type string: str or unicode
    :param string:
      the string on which the regular expression has been applied to
      obtain the `match` object

    :type underline_char: str or unicode
    :param underline_char:
      character to use to underline the matched section, default to the
      caret '^'

    :rtype: str or unicode
    :return:
      the original string with an inserted line to underline the match
      location
    """
    # Replacing line breaks may change the length of the string, which would
    # invalidate the offsets of the match. The text before, inside and after
    # the match is hence normalized separately and the offsets are computed
    # again from the normalized pieces. (A match boundary falling between the
    # two characters of a CRLF is not handled specifically.)
    before = _LINE_RGX.sub(linesep, string[:match.start()])
    matched = _LINE_RGX.sub(linesep, string[match.start():match.end()])
    after = _LINE_RGX.sub(linesep, string[match.end():])
    start = len(before)
    end = start + len(matched)
    string = before + matched + after
    # beginning of the line on which the match starts
    start_line_pos = string.rfind(linesep, 0, start)
    if start_line_pos == -1:
        start_line_pos = 0
        result = []
    else:
        # keep the lines preceding the match as is
        result = [string[:start_line_pos]]
        start_line_pos += len(linesep)
    offset = start - start_line_pos
    underline = ' ' * offset + underline_char * (end - start)
    # end of the line on which the match ends
    end_line_pos = string.find(linesep, end)
    if end_line_pos == -1:
        result.append(string[start_line_pos:])
        result.append(underline)
    else:
        result.append(string[start_line_pos:end_line_pos])
        result.append(underline)
        # lines following the match come after the underline
        result.append(string[end_line_pos + len(linesep):])
    return linesep.join(result).rstrip()
# Ansi colorization ###########################################################

# start of an ANSI escape sequence
ANSI_PREFIX = '\033['
# end of an ANSI escape sequence
ANSI_END = 'm'
# full escape sequence resetting the format set by a previous sequence
ANSI_RESET = '\033[0m'
# style identifier -> ANSI terminal code
ANSI_STYLES = {
    'reset': "0",
    'bold': "1",
    'italic': "3",
    'underline': "4",
    'blink': "5",
    'inverse': "7",
    'strike': "9",
}
# color identifier -> ANSI terminal code
ANSI_COLORS = {
    'reset': "0",
    'black': "30",
    'red': "31",
    'green': "32",
    'yellow': "33",
    'blue': "34",
    'magenta': "35",
    'cyan': "36",
    'white': "37",
}
def _get_ansi_code(color=None, style=None):
    """build the ansi escape code selecting the given color and style

    :type color: str or None
    :param color:
      the color name (see `ANSI_COLORS` for available values)
      or the color number when 256 colors are available

    :type style: str or None
    :param style:
      style string (see `ANSI_STYLES` for available values). To get
      several style effects at the same time, use a comma as separator.

    :raise KeyError: if an unexistent color or style identifier is given

    :rtype: str
    :return:
      the built escape code, or an empty string when neither a color nor
      a style is given
    """
    codes = []
    if style:
        codes.extend(ANSI_STYLES[effect] for effect in splitstrip(style))
    if color:
        if color.isdigit():
            # color number: '38;5;<number>'
            codes += ['38', '5', color]
        else:
            codes.append(ANSI_COLORS[color])
    if not codes:
        return ''
    return ANSI_PREFIX + ';'.join(codes) + ANSI_END
486
def colorize_ansi(msg, color=None, style=None):
    """wrap a message with the ansi escape codes selecting the given color
    and style, followed by the code resetting the format

    :type msg: str or unicode
    :param msg: the message string to colorize

    :type color: str or None
    :param color:
      the color identifier (see `ANSI_COLORS` for available values)

    :type style: str or None
    :param style:
      style string (see `ANSI_STYLES` for available values). To get
      several style effects at the same time, use a comma as separator.

    :raise KeyError: if an unexistent color or style identifier is given

    :rtype: str or unicode
    :return:
      the ansi escaped string, or the message itself when no escape code
      has to be applied
    """
    # nothing to do unless at least one of color and style is given
    if color is not None or style is not None:
        prefix = _get_ansi_code(color, style)
        # an empty code means there is nothing to apply: leave msg untouched
        if prefix:
            return '%s%s%s' % (prefix, msg, ANSI_RESET)
    return msg
# kind of diff line -> color identifier
DIFF_STYLE = {'separator': 'cyan', 'remove': 'red', 'add': 'green'}


def diff_colorize_ansi(lines, out=None, style=DIFF_STYLE):
    """write the lines of a diff to a stream, colorized with ansi escape
    codes

    Lines starting with '--- ' or '+++ ' use the 'separator' color, other
    lines starting with '-' the 'remove' color and other lines starting with
    '+' the 'add' color. Any other line, empty ones included, is written
    unchanged.

    :type lines: iterable of str or unicode
    :param lines: the lines of the diff, written as is (no line break added)

    :type out: file-like object or None
    :param out:
      stream with a `write` method, default to `sys.stdout` as it is when
      the function is called

    :type style: dict
    :param style:
      color identifiers to use, indexed by 'separator', 'remove' and 'add'
    """
    if out is None:
        # resolved at call time so that a redirected stdout is honoured
        out = sys.stdout
    for line in lines:
        # file headers have to be checked before single '-' / '+' markers
        if line[:4] in ('--- ', '+++ '):
            out.write(colorize_ansi(line, style['separator']))
        # one-character slices: unlike line[0], valid on an empty line
        elif line[:1] == '-':
            out.write(colorize_ansi(line, style['remove']))
        elif line[:1] == '+':
            out.write(colorize_ansi(line, style['add']))
        else:
            out.write(line)
532