2012-11-10 16:47:12 +01:00
|
|
|
|
# -*- coding: utf-8 -*-
|
2012-04-21 18:26:24 +02:00
|
|
|
|
""" formatting.py - handy functions for formatting text
|
|
|
|
|
this file contains code from the following URL:
|
|
|
|
|
<http://code.djangoproject.com/svn/django/trunk/django/utils/text.py>
|
|
|
|
|
"""
|
2012-04-21 18:35:10 +02:00
|
|
|
|
|
2012-04-21 18:26:24 +02:00
|
|
|
|
import re
|
2012-04-20 16:35:35 +02:00
|
|
|
|
|
2013-09-05 01:00:04 +02:00
|
|
|
|
from HTMLParser import HTMLParser
|
|
|
|
|
import htmlentitydefs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class HTMLTextExtractor(HTMLParser):
    """HTML parser that collects only the textual content it is fed.

    No tag callbacks are overridden, so markup itself contributes nothing;
    text runs and decoded character/entity references are accumulated in
    ``self.result`` and joined by ``get_text()``.
    """

    def __init__(self):
        # Explicit base-class call (rather than super()) so this also works
        # where HTMLParser is an old-style class.
        HTMLParser.__init__(self)
        # Text fragments, in the order the parser reported them.
        self.result = []

    def handle_data(self, d):
        """Record a run of plain text."""
        self.result.append(d)

    def handle_charref(self, number):
        """Record a numeric reference such as ``&#65;`` or ``&#x41;``.

        ``number`` is the reference body without ``&#`` and ``;``.  If it
        does not denote a valid code point, the reference is kept literally
        instead of raising.
        """
        try:
            if number[0] in (u'x', u'X'):
                codepoint = int(number[1:], 16)
            else:
                codepoint = int(number)
            char = unichr(codepoint)
        except (ValueError, OverflowError):
            # Malformed digits or a code point outside the supported range.
            char = u'&#%s;' % number
        self.result.append(char)

    def handle_entityref(self, name):
        """Record a named reference such as ``&amp;``.

        ``name`` is the entity name without ``&`` and ``;``.  Names missing
        from ``htmlentitydefs.name2codepoint`` are kept literally instead of
        raising ``KeyError``.
        """
        try:
            char = unichr(htmlentitydefs.name2codepoint[name])
        except KeyError:
            char = u'&%s;' % name
        self.result.append(char)

    def get_text(self):
        """Return everything collected so far as a single string."""
        return u''.join(self.result)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def strip_html(html):
    """Return the text of ``html`` as collected by ``HTMLTextExtractor``.

    The markup is fed to a fresh extractor in a single call and the
    accumulated text is returned.  The parser is never closed, so anything
    it is still buffering as incomplete input is not part of the result.
    """
    extractor = HTMLTextExtractor()
    extractor.feed(html)
    return extractor.get_text()
|
|
|
|
|
|
2012-04-21 06:03:08 +02:00
|
|
|
|
|
2012-11-10 16:47:12 +01:00
|
|
|
|
def munge(text, munge_count=0):
    """Replace letters in ``text`` with their look-alike characters.

    Characters are scanned left to right; each one that has a non-empty
    entry in ``character_replacements`` is swapped for that entry.

    :param text: the string to munge.
    :param munge_count: stop after this many replacements.  The counter is
        compared only after a replacement has been made, so the default of
        0 is never reached and every eligible character is replaced.
    :returns: the munged string.
    """
    # Work on a list of characters so each replacement is O(1) instead of
    # rebuilding the whole string, and so a replacement of any length
    # cannot shift the positions still to be visited.
    chars = list(text)
    reps = 0
    for n, char in enumerate(chars):
        rep = character_replacements.get(char)
        if not rep:
            # No entry, or an empty one: leave the character untouched.
            continue
        if isinstance(rep, bytes):
            # Byte-string table entries hold UTF-8; text entries are
            # already decoded and must not be decoded again.
            rep = rep.decode('utf8')
        chars[n] = rep
        reps += 1
        if reps == munge_count:
            break
    # Join on an empty string of the input's own type.
    return text[:0].join(chars)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Look-alike table used by munge(): maps an ASCII letter to a visually
# similar non-ASCII character.  The values are plain string literals that
# munge() decodes as UTF-8.
# NOTE(review): 'v' maps to an empty string, which munge() treats as "no
# replacement" -- possibly a character lost at some point; confirm.
# NOTE(review): 'Q' appears to map to itself; verify this is intended.
character_replacements = {
    'a': 'ä',
    'b': 'Б',
    'c': 'ċ',
    'd': 'đ',
    'e': 'ë',
    'f': 'ƒ',
    'g': 'ġ',
    'h': 'ħ',
    'i': 'í',
    'j': 'ĵ',
    'k': 'ķ',
    'l': 'ĺ',
    'm': 'ṁ',
    'n': 'ñ',
    'o': 'ö',
    'p': 'ρ',
    'q': 'ʠ',
    'r': 'ŗ',
    's': 'š',
    't': 'ţ',
    'u': 'ü',
    'v': '',
    'w': 'ω',
    'x': 'χ',
    'y': 'ÿ',
    'z': 'ź',
    'A': 'Å',
    'B': 'Β',
    'C': 'Ç',
    'D': 'Ď',
    'E': 'Ē',
    'F': 'Ḟ',
    'G': 'Ġ',
    'H': 'Ħ',
    'I': 'Í',
    'J': 'Ĵ',
    'K': 'Ķ',
    'L': 'Ĺ',
    'M': 'Μ',
    'N': 'Ν',
    'O': 'Ö',
    'P': 'Р',
    'Q': 'Q',
    'R': 'Ŗ',
    'S': 'Š',
    'T': 'Ţ',
    'U': 'Ů',
    'V': 'Ṿ',
    'W': 'Ŵ',
    'X': 'Χ',
    'Y': 'Ỳ',
    'Z': 'Ż'}
|
|
|
|
|
|
|
|
|
|
|
2012-04-20 16:35:35 +02:00
|
|
|
|
def capitalize_first(line):
    """
    Capitalise the first letter of every space-separated word in ``line``.

    All other characters are left exactly as they are (unlike
    ``str.title()``, which lower-cases the rest of each word).  Empty
    segments -- an empty ``line`` or runs of consecutive spaces -- are
    passed through unchanged, so the spacing of the input is preserved.
    """
    # word[:1] (not word[0]) so that empty segments do not raise IndexError.
    return ' '.join(word[:1].upper() + word[1:] for word in line.split(' '))
|
2012-04-23 11:46:52 +02:00
|
|
|
|
|
2012-05-13 22:12:58 +02:00
|
|
|
|
|
2012-06-11 23:45:23 +02:00
|
|
|
|
def multiword_replace(text, wordDic):
    """
    Replace every occurrence in ``text`` of a key of ``wordDic`` with the
    associated value and return the changed text.

    Keys are matched literally (they are regex-escaped), in a single pass
    over ``text``, so replacement values are never themselves re-scanned.
    When several keys could match at the same position the longest one
    wins.  An empty ``wordDic`` returns ``text`` unchanged.
    """
    if not wordDic:
        # An empty alternation would match the empty string everywhere and
        # then fail the dictionary lookup.
        return text

    # Alternatives are tried left to right, so put longer keys first to
    # stop a short key from shadowing a longer key that starts with it.
    keys = sorted(wordDic, key=len, reverse=True)
    rc = re.compile('|'.join(map(re.escape, keys)))

    def translate(match):
        return wordDic[match.group(0)]

    return rc.sub(translate, text)
|
|
|
|
|
|
|
|
|
|
|
2013-09-04 09:52:38 +02:00
|
|
|
|
def truncate_words(content, length=10, suffix='...'):
    """Truncates a string after a certain number of words.

    Words are the pieces obtained by splitting ``content`` on single
    spaces.

    :param content: the string to shorten.
    :param length: maximum number of words to keep.
    :param suffix: appended only when words were actually dropped.
    :returns: ``content`` unchanged if it has at most ``length`` words,
        otherwise its first ``length`` words followed by ``suffix``.
    """
    words = content.split(" ")
    if len(words) <= length:
        # Nothing to cut: no suffix, and the input is returned as-is.
        return content
    # max() guards against a negative length slicing from the end.
    return " ".join(words[:max(length, 0)]) + suffix
|
|
|
|
|
|
|
|
|
|
|
2012-05-13 22:12:58 +02:00
|
|
|
|
# from <http://stackoverflow.com/questions/250357/smart-truncate-in-python>
|
2012-10-13 01:33:17 +02:00
|
|
|
|
def truncate_str(content, length=100, suffix='...'):
    """Truncates a string after a certain number of chars.

    The cut is moved back to the previous space so that no word is split;
    if there is no space to fall back to, the string is cut hard at
    ``length`` characters.

    :param content: the string to shorten.
    :param length: maximum number of characters of ``content`` to keep
        (``suffix`` is added on top of this).
    :param suffix: appended only when ``content`` was actually shortened.
    :returns: ``content`` unchanged if it is at most ``length`` characters
        long, otherwise the shortened text followed by ``suffix``.
    """
    if len(content) <= length:
        return content

    # Look one character past the limit: if that character is a space, the
    # first ``length`` characters end on a word boundary and can be kept
    # whole instead of sacrificing the last complete word.
    head = content[:length + 1]
    if ' ' in head:
        # Drop the partial word, and any spaces left dangling before it.
        head = head.rsplit(' ', 1)[0].rstrip(' ')
    else:
        head = content[:length]
    return head + suffix
|
|
|
|
|
|
|
|
|
|
|
2012-04-21 18:35:10 +02:00
|
|
|
|
# ALL CODE BELOW THIS LINE IS COVERED BY THE FOLLOWING AGREEMENT:
|
|
|
|
|
|
|
|
|
|
# Copyright (c) Django Software Foundation and individual contributors.
|
|
|
|
|
# All rights reserved.
|
|
|
|
|
#
|
|
|
|
|
# Redistribution and use in source and binary forms, with or without
|
|
|
|
|
# modification, are permitted provided that the following conditions are met:
|
|
|
|
|
#
|
|
|
|
|
# 1. Redistributions of source code must retain the above copyright notice,
|
|
|
|
|
# this list of conditions and the following disclaimer.
|
|
|
|
|
#
|
|
|
|
|
# 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
|
# notice, this list of conditions and the following disclaimer in the
|
|
|
|
|
# documentation and/or other materials provided with the distribution.
|
|
|
|
|
#
|
|
|
|
|
# 3. Neither the name of Django nor the names of its contributors may be used
|
|
|
|
|
# to endorse or promote products derived from this software without
|
|
|
|
|
# specific prior written permission.
|
|
|
|
|
#
|
|
|
|
|
#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
|
|
|
|
#ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
|
|
|
#WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
|
|
|
#DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
|
|
|
|
#ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
|
|
|
|
#(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
|
|
|
#LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
|
|
|
|
#ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
|
|
|
#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
|
|
|
#SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
2012-04-21 18:26:24 +02:00
|
|
|
|
|
|
|
|
|
# Expression to match some_token and some_token="with spaces" (and similarly
# for single-quoted strings).
#
# Each match is one token.  The first alternative is a run of characters
# that are neither whitespace nor quotes, containing at least one complete
# double- or single-quoted string (inside the quotes a backslash escapes the
# following character); the second alternative is any run of
# non-whitespace.  The pattern is compiled with re.VERBOSE, so the literal
# spaces around "|" are not part of the expression.
split_re = re.compile(r"""((?:[^\s'"]*(?:(?:"(?:[^"\\]|\\.)*" | '(?:[""" \
                      r"""^'\\]|\\.)*')[^\s'"]*)+) | \S+)""", re.VERBOSE)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def smart_split(text):
    r"""
    Yield the whitespace-separated tokens of ``text``, treating a quoted
    phrase as part of a single token.

    Both single and double quotes are recognised, and a backslash inside a
    quoted phrase escapes the next character.  Tokens are yielded exactly
    as they appear in ``text``: surrounding quote marks are kept and
    escaped quotes stay escaped.

    >>> list(smart_split(r'This is "a person\'s" test.'))
    [u'This', u'is', u'"a person\\\'s"', u'test.']
    >>> list(smart_split(r"Another 'person\'s' test."))
    [u'Another', u"'person\\'s'", u'test.']
    >>> list(smart_split(r'A "\"funky\" style" test.'))
    [u'A', u'"\\"funky\\" style"', u'test.']
    """
    for match in split_re.finditer(text):
        token = match.group(0)
        yield token
|
2012-04-23 11:46:52 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_text_list(list_, last_word='or'):
    """
    Join the items of ``list_`` into a human-readable enumeration.

    All items but the last are separated by ", "; the last one is attached
    with ``last_word``.  Items are converted to text with ``%s``
    formatting, so non-string items are accepted, and ``list_`` may be any
    iterable.

    >>> get_text_list(['a', 'b', 'c', 'd'])
    'a, b, c or d'
    >>> get_text_list(['a', 'b', 'c'], 'and')
    'a, b and c'
    >>> get_text_list(['a', 'b'], 'and')
    'a and b'
    >>> get_text_list(['a'])
    'a'
    >>> get_text_list([])
    ''
    >>> get_text_list([1, 2, 3])
    '1, 2 or 3'
    """
    # The one-element tuple keeps an item that is itself a tuple from
    # being unpacked as format arguments.
    items = ['%s' % (item,) for item in list_]
    if not items:
        return ''
    if len(items) == 1:
        return items[0]
    return '%s %s %s' % (', '.join(items[:-1]), last_word, items[-1])
|