1 """Some text manipulation utility functions.
2
3 :author: Logilab
4 :copyright: 2003-2008 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
5 :contact: http://www.logilab.fr/ -- mailto:contact@logilab.fr
6 :license: General Public License version 2 - http://www.gnu.org/licenses
7
8 :group text formatting: normalize_text, normalize_paragraph, pretty_match,\
9 unquote, colorize_ansi
10 :group text manipulation: searchall, splitstrip
11 :sort: text formatting, text manipulation
12
13 :type ANSI_STYLES: dict(str)
14 :var ANSI_STYLES: dictionary mapping style identifier to ANSI terminal code
15
16 :type ANSI_COLORS: dict(str)
17 :var ANSI_COLORS: dictionary mapping color identifier to ANSI terminal code
18
19 :type ANSI_PREFIX: str
20 :var ANSI_PREFIX:
21 ANSI terminal code notifying the start of an ANSI escape sequence
22
23 :type ANSI_END: str
24 :var ANSI_END:
25 ANSI terminal code notifying the end of an ANSI escape sequence
26
27 :type ANSI_RESET: str
28 :var ANSI_RESET:
29 ANSI terminal code resetting format defined by a previous ANSI escape sequence
30 """
31 __docformat__ = "restructuredtext en"
32
33 import sys
34 import re
35 from unicodedata import normalize as _uninormalize
36 try:
37 from os import linesep
38 except ImportError:
39 linesep = '\n'
40
41 from logilab.common.deprecation import deprecated
42
# Hand-written ASCII replacements, looked up before falling back to Unicode
# decomposition; a replacement may be several characters long.
MANUAL_UNICODE_MAP = {
    # punctuation and symbols
    u'\xa1': u'!',      # inverted exclamation mark
    u'\xab': u'"',      # left-pointing double angle quotation mark
    u'\xbb': u'"',      # right-pointing double angle quotation mark
    u'\u2044': u'/',    # fraction slash
    u'\xa9': u'(c)',    # copyright sign
    u'\xae': u'(r)',    # registered sign
    # ligatures
    u'\xc6': u'AE',     # latin capital letter AE
    u'\xe6': u'ae',     # latin small letter ae
    u'\u0152': u'OE',   # latin capital ligature OE
    u'\u0153': u'oe',   # latin small ligature oe
    u'\xdf': u'ss',     # latin small letter sharp s
    # letters with a stroke
    u'\xd8': u'O',      # latin capital letter O with stroke
    u'\xf8': u'o',      # latin small letter o with stroke
    u'\u0142': u'l',    # latin small letter l with stroke
}
59
61 """replace diacritical characters with their corresponding ascii characters
62 """
63 res = []
64 for letter in ustring[:]:
65 try:
66 replacement = MANUAL_UNICODE_MAP[letter]
67 except KeyError:
68 if ord(letter) >= 2**8:
69 if ignorenonascii:
70 continue
71 raise ValueError("can't deal with non-ascii based characters")
72 replacement = _uninormalize('NFD', letter)[0]
73 res.append(replacement)
74 return u''.join(res)
75
77 """remove optional quotes (simple or double) from the string
78
79 :type string: str or unicode
80 :param string: an optionally quoted string
81
82 :rtype: str or unicode
83 :return: the unquoted string (or the input string if it wasn't quoted)
84 """
85 if not string:
86 return string
87 if string[0] in '"\'':
88 string = string[1:]
89 if string[-1] in '"\'':
90 string = string[:-1]
91 return string
92
93
# Paragraph separator: two consecutive line breaks (LF or CRLF).
_BLANKLINES_RGX = re.compile(r'\r?\n\r?\n')
# Any run of whitespace characters. Written as a raw string: '\s' in a plain
# literal is an invalid escape sequence that recent Python versions warn about.
_NORM_SPACES_RGX = re.compile(r'\s+')
96
def normalize_text(text, line_len=80, indent='', rest=False):
    """Wrap a text so that it fits a maximum line size, with an optional
    indentation string put in front of every line.

    The text is cut into paragraphs on blank lines; each paragraph is
    wrapped separately and the blank lines between paragraphs are kept.
    The indentation string may be used to insert a comment (#) or a
    quoting (>) mark for instance.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :type rest: bool
    :param rest:
      when true, paragraphs are wrapped with `normalize_rest_paragraph`
      instead of `normalize_paragraph`

    :rtype: str or unicode
    :return:
      the wrapped paragraphs, joined by a line holding only the
      indentation string
    """
    wrap_paragraph = normalize_rest_paragraph if rest else normalize_paragraph
    paragraphs = [wrap_paragraph(paragraph, line_len, indent)
                  for paragraph in _BLANKLINES_RGX.split(text)]
    # paragraphs are separated by an "empty" line carrying only the indent
    separator = '%s%s%s' % (linesep, indent, linesep)
    return separator.join(paragraphs)
126
127
129 """normalize a text to display it with a maximum line size and
130 optionally arbitrary indentation. Line jumps are normalized. The
131 indentation string may be used to insert a comment mark for
132 instance.
133
134 :type text: str or unicode
135 :param text: the input text to normalize
136
137 :type line_len: int
138 :param line_len: expected maximum line's length, default to 80
139
140 :type indent: str or unicode
141 :param indent: optional string to use as indentation
142
143 :rtype: str or unicode
144 :return:
145 the input text normalized to fit on lines with a maximized size
146 inferior to `line_len`, and optionally prefixed by an
147 indentation string
148 """
149 text = _NORM_SPACES_RGX.sub(' ', text)
150 line_len = line_len - len(indent)
151 lines = []
152 while text:
153 aline, text = splittext(text.strip(), line_len)
154 lines.append(indent + aline)
155 return linesep.join(lines)
156
158 """normalize a ReST text to display it with a maximum line size and
159 optionally arbitrary indentation. Line jumps are normalized. The
160 indentation string may be used to insert a comment mark for
161 instance.
162
163 :type text: str or unicode
164 :param text: the input text to normalize
165
166 :type line_len: int
167 :param line_len: expected maximum line's length, default to 80
168
169 :type indent: str or unicode
170 :param indent: optional string to use as indentation
171
172 :rtype: str or unicode
173 :return:
174 the input text normalized to fit on lines with a maximized size
175 inferior to `line_len`, and optionally prefixed by an
176 indentation string
177 """
178 toreport = ''
179 lines = []
180 line_len = line_len - len(indent)
181 for line in text.splitlines():
182 line = toreport + _NORM_SPACES_RGX.sub(' ', line.strip())
183 toreport = ''
184 while len(line) > line_len:
185
186 line, toreport = splittext(line, line_len)
187 lines.append(indent + line)
188 if toreport:
189 line = toreport + ' '
190 toreport = ''
191 else:
192 line = ''
193 if line:
194 lines.append(indent + line.strip())
195 return linesep.join(lines)
196
def splittext(text, line_len):
    """Split the given text on a space according to the given max line size.

    :type text: str or unicode
    :param text: the text to split

    :type line_len: int
    :param line_len:
      maximum length wanted for the first chunk; a negative value is
      handled like 0

    :rtype: tuple
    :return: a 2-uple:
      * a line <= line_len if possible; when no space is found before the
        limit, the line is cut at the first space found after it, or not
        cut at all if the text has no space left
      * the rest of the text, stripped, which has to be reported on
        another line
    """
    # A negative limit would be used below as a negative index, which slices
    # from the end of the text: characters get lost, or the whole text comes
    # back as "rest" and callers looping until it is empty never terminate.
    line_len = max(line_len, 0)
    if len(text) <= line_len:
        return text, ''
    # look backward for the last space located at or before the limit
    pos = min(len(text) - 1, line_len)
    while pos > 0 and text[pos] != ' ':
        pos -= 1
    if pos == 0:
        # nothing usable before the limit: look forward for the next space
        pos = min(len(text), line_len)
        while len(text) > pos and text[pos] != ' ':
            pos += 1
    return text[:pos], text[pos + 1:].strip()
214
215
217 """return a list of stripped string by splitting the string given as
218 argument on `sep` (',' by default). Empty string are discarded.
219
220 >>> splitstrip('a, b, c , 4,,')
221 ['a', 'b', 'c', '4']
222 >>> splitstrip('a')
223 ['a']
224 >>>
225
226 :type string: str or unicode
227 :param string: a csv line
228
229 :type sep: str or unicode
230 :param sep: field separator, default to the comma (',')
231
232 :rtype: list
233 :return: the non-empty fields of the string, stripped of surrounding whitespace
234 """
235 return [word.strip() for word in string.split(sep) if word.strip()]
236
# Old name for splitstrip, wrapped with the `deprecated` decorator.
get_csv = deprecated()(splitstrip)
238
# Blanks between values: any run of whitespace characters and commas.
_BLANK_URE = r'(\s|,)+'
_BLANK_RE = re.compile(_BLANK_URE)
# Optionally signed number: digits followed by a dot and optional decimals,
# or plain digits optionally prefixed with "0" or "0x".
__VALUE_URE = r'-?(([0-9]+\.[0-9]*)|((0x?)?[0-9]+))'
# A unit is a run of ASCII letters.
__UNITS_URE = r'[a-zA-Z]+'
# A value followed by an optional unit, exposed as the "value" and "unit"
# named groups.
_VALUE_RE = re.compile(
    '(?P<value>' + __VALUE_URE + ')(?P<unit>' + __UNITS_URE + ')?')
244
# Number of bytes represented by each size unit (successive powers of 1024).
BYTE_UNITS = dict(
    B=1,
    KB=1 << 10,
    MB=1 << 20,
    GB=1 << 30,
    TB=1 << 40,
)
252
# Number of seconds represented by each time unit.
TIME_UNITS = {
    "ms": 0.001,  # one thousandth of a second (was 0.0001, off by a factor 10)
    "s": 1,
    "min": 60,
    "h": 60 * 60,
    "d": 60 * 60 * 24,
}
260
263 """Parse the string applying the units defined in units
264 (e.g.: "1.5m", {'m': 60} -> 90).
265
266 :type string: str or unicode
267 :param string: the string to parse
268
269 :type units: dict (or any object with __getitem__ using basestring key)
270 :param units: a dict mapping a unit string repr to its value
271
272 :type inter: type
273 :param inter: used to parse every intermediate value (results must support ``+`` and ``*``)
274
275 :type blank_reg: regexp
276 :param blank_reg: should match every blank char to ignore.
277
278 :type value_reg: regexp with "value" and optional "unit" group
279 :param value_reg: matches a value and its optional unit in the string
280 """
281 if inter is None:
282 inter = final
283 string = _BLANK_RE.sub('',string)
284 values = []
285 for match in value_reg.finditer(string):
286 dic = match.groupdict()
287
288
289 lit, unit = dic["value"], dic.get("unit")
290 value = inter(lit)
291 if unit is not None:
292 value *= units[unit]
293 values.append(value)
294 return final(sum(values))
295
# Matches one line break: CRLF, a run of one or more CR, or a single LF.
_LINE_RGX = re.compile('\r\n|\r+|\n')
297
299 """return a string with the match location underlined:
300
301 >>> import re
302 >>> print pretty_match(re.search('mange', 'il mange du bacon'), 'il mange du bacon')
303 il mange du bacon
304 ^^^^^
305 >>>
306
307 :type match: _sre.SRE_match
308 :param match: object returned by re.match, re.search or re.finditer
309
310 :type string: str or unicode
311 :param string:
312 the string on which the regular expression has been applied to
313 obtain the `match` object
314
315 :type underline_char: str or unicode
316 :param underline_char:
317 character to use to underline the matched section, default to the
318 caret '^'
319
320 :rtype: str or unicode
321 :return:
322 the original string with an inserted line to underline the match
323 location
324 """
325 start = match.start()
326 end = match.end()
327 string = _LINE_RGX.sub(linesep, string)
328 start_line_pos = string.rfind(linesep, 0, start)
329 if start_line_pos == -1:
330 start_line_pos = 0
331 result = []
332 else:
333 result = [string[:start_line_pos]]
334 start_line_pos += len(linesep)
335 offset = start - start_line_pos
336 underline = ' ' * offset + underline_char * (end - start)
337 end_line_pos = string.find(linesep, end)
338 if end_line_pos == -1:
339 string = string[start_line_pos:]
340 result.append(string)
341 result.append(underline)
342 else:
343 end = string[end_line_pos + len(linesep):]
344 string = string[start_line_pos:end_line_pos]
345 result.append(string)
346 result.append(underline)
347 result.append(end)
348 return linesep.join(result).rstrip()
349
350
351
352
# Pieces of an ANSI escape sequence: the codes, joined with ';', go between
# ANSI_PREFIX and ANSI_END.
ANSI_PREFIX = '\033['
ANSI_END = 'm'
# Complete sequence made of the single "reset" code.
ANSI_RESET = '\033[0m'

# Style identifier -> ANSI terminal code.
ANSI_STYLES = {
    'reset': "0",
    'bold': "1",
    'italic': "3",
    'underline': "4",
    'blink': "5",
    'inverse': "7",
    'strike': "9",
}

# Color identifier -> ANSI terminal code.
ANSI_COLORS = {
    'reset': "0",
    'black': "30",
    'red': "31",
    'green': "32",
    'yellow': "33",
    'blue': "34",
    'magenta': "35",
    'cyan': "36",
    'white': "37",
}
376
377
379 """return ansi escape code corresponding to color and style
380
381 :type color: str or None
382 :param color:
383 the color identifier (see `ANSI_COLORS` for available values)
384
385 :type style: str or None
386 :param style:
387 style string (see `ANSI_STYLES` for available values). To get
388 several style effects at the same time, use a comma as separator.
389
390 :raise KeyError: if an unexistent color or style identifier is given
391
392 :rtype: str
393 :return: the built escape code
394 """
395 ansi_code = []
396 if style:
397 style_attrs = splitstrip(style)
398 for effect in style_attrs:
399 ansi_code.append(ANSI_STYLES[effect])
400 if color:
401 ansi_code.append(ANSI_COLORS[color])
402 if ansi_code:
403 return ANSI_PREFIX + ';'.join(ansi_code) + ANSI_END
404 return ''
405
407 """colorize message by wrapping it with ansi escape codes
408
409 :type msg: str or unicode
410 :param msg: the message string to colorize
411
412 :type color: str or None
413 :param color:
414 the color identifier (see `ANSI_COLORS` for available values)
415
416 :type style: str or None
417 :param style:
418 style string (see `ANSI_STYLES` for available values). To get
419 several style effects at the same time, use a comma as separator.
420
421 :raise KeyError: if an unexistent color or style identifier is given
422
423 :rtype: str or unicode
424 :return: the ansi escaped string
425 """
426
427 if color is None and style is None:
428 return msg
429 escape_code = _get_ansi_code(color, style)
430
431 if escape_code:
432 return '%s%s%s' % (escape_code, msg, ANSI_RESET)
433 return msg
434
# Color identifier associated with each kind of diff line.
DIFF_STYLE = {
    'separator': 'cyan',
    'remove': 'red',
    'add': 'green',
}
436
451