move lib to core, no more sys.path fucking @cybojenix
This commit is contained in:
parent
ef48b81924
commit
7dc1daa69f
14 changed files with 911 additions and 4 deletions
230
util/text.py
Normal file
230
util/text.py
Normal file
|
@ -0,0 +1,230 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
""" formatting.py - handy functions for formatting text
|
||||
this file contains code from the following URL:
|
||||
<http://code.djangoproject.com/svn/django/trunk/django/utils/text.py>
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
from HTMLParser import HTMLParser
|
||||
import htmlentitydefs
|
||||
|
||||
|
||||
class HTMLTextExtractor(HTMLParser):
    """HTML parser that collects the textual content of a document.

    Text runs, numeric character references and named entity references are
    appended to ``self.result`` in the order the parser reports them; no
    handler is defined for tags, so markup itself is not recorded. Call
    ``get_text()`` after feeding the parser to obtain the joined text.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # Text fragments in the order they were reported by the parser.
        self.result = []

    def handle_data(self, d):
        """Record a run of plain text."""
        self.result.append(d)

    def handle_charref(self, number):
        """Record the character for a numeric reference.

        ``number`` is the reference body, e.g. ``'65'`` or ``'x41'``. A
        leading ``x``/``X`` selects hexadecimal, otherwise it is decimal.
        A malformed or out-of-range reference is kept as its literal
        ``&#...;`` text instead of raising.
        """
        try:
            if number[:1] in (u'x', u'X'):
                codepoint = int(number[1:], 16)
            else:
                codepoint = int(number)
            self.result.append(unichr(codepoint))
        except (ValueError, OverflowError):
            # Not a representable code point: preserve the source text.
            self.result.append(u'&#%s;' % number)

    def handle_entityref(self, name):
        """Record the character for a named entity such as ``amp``.

        Names missing from ``htmlentitydefs.name2codepoint`` are kept as
        their literal ``&name;`` text instead of raising ``KeyError``.
        """
        try:
            codepoint = htmlentitydefs.name2codepoint[name]
        except KeyError:
            self.result.append(u'&%s;' % name)
        else:
            self.result.append(unichr(codepoint))

    def get_text(self):
        """Return everything collected so far as one unicode string."""
        return u''.join(self.result)
|
||||
|
||||
|
||||
def strip_html(html):
    """Return the text content of *html* with the markup removed.

    The input is fed to a fresh ``HTMLTextExtractor`` and whatever text it
    collected is returned as a single string.
    """
    extractor = HTMLTextExtractor()
    extractor.feed(html)
    text = extractor.get_text()
    return text
|
||||
|
||||
|
||||
def munge(text, munge_count=0):
    """Swap characters of *text* for their ``character_replacements`` entries.

    Walks the string left to right; every character that has a non-empty
    entry in ``character_replacements`` is replaced by that entry decoded
    from UTF-8. Scanning stops once ``munge_count`` replacements have been
    made. The default of 0 never equals the running count, so every
    replaceable character is substituted.
    """
    replaced = 0
    for index in xrange(len(text)):
        substitute = character_replacements.get(text[index])
        if not substitute:
            continue
        head, tail = text[:index], text[index + 1:]
        text = head + substitute.decode('utf8') + tail
        replaced += 1
        if replaced == munge_count:
            break
    return text
|
||||
|
||||
|
||||
# Substitution table consulted by munge(): each ASCII letter maps to the
# replacement string munge() swaps in for it. The values are plain string
# literals in this UTF-8 encoded source; munge() decodes them from UTF-8
# before inserting them.
# NOTE(review): the 'v' entry is an empty string as written here, which
# munge() treats as "no replacement" -- confirm no character was lost.
character_replacements = {
    'a': 'ä',
    'b': 'Б',
    'c': 'ċ',
    'd': 'đ',
    'e': 'ë',
    'f': 'ƒ',
    'g': 'ġ',
    'h': 'ħ',
    'i': 'í',
    'j': 'ĵ',
    'k': 'ķ',
    'l': 'ĺ',
    'm': 'ṁ',
    'n': 'ñ',
    'o': 'ö',
    'p': 'ρ',
    'q': 'ʠ',
    'r': 'ŗ',
    's': 'š',
    't': 'ţ',
    'u': 'ü',
    'v': '',
    'w': 'ω',
    'x': 'χ',
    'y': 'ÿ',
    'z': 'ź',
    'A': 'Å',
    'B': 'Β',
    'C': 'Ç',
    'D': 'Ď',
    'E': 'Ē',
    'F': 'Ḟ',
    'G': 'Ġ',
    'H': 'Ħ',
    'I': 'Í',
    'J': 'Ĵ',
    'K': 'Ķ',
    'L': 'Ĺ',
    'M': 'Μ',
    'N': 'Ν',
    'O': 'Ö',
    'P': 'Р',
    'Q': 'Q',
    'R': 'Ŗ',
    'S': 'Š',
    'T': 'Ţ',
    'U': 'Ů',
    'V': 'Ṿ',
    'W': 'Ŵ',
    'X': 'Χ',
    'Y': 'Ỳ',
    'Z': 'Ż'}
|
||||
|
||||
|
||||
def capitalize_first(line):
    """
    Upper-case the first character of every space-separated word in *line*,
    leaving the remaining characters of each word untouched.

    The line is split on single spaces, so runs of spaces (and an empty
    line) yield empty "words"; these are passed through unchanged rather
    than raising IndexError.

    >>> capitalize_first('hello wORLD')
    'Hello WORLD'
    >>> capitalize_first('a  b')
    'A  B'
    """
    # word[:1] is '' for an empty word, whereas word[0] would raise.
    return ' '.join(word[:1].upper() + word[1:] for word in line.split(' '))
|
||||
|
||||
|
||||
def multiword_replace(text, wordDic):
    """
    Replace every occurrence in *text* of a key of *wordDic* with the
    associated value and return the changed text.

    Keys are matched literally (they are regex-escaped) and anywhere in the
    text, not only at word boundaries. All replacements happen in a single
    pass, so replaced text is never re-scanned. When one key is a prefix of
    another, the longer key takes precedence.

    An empty *wordDic* returns *text* unchanged.
    """
    if not wordDic:
        # An empty alternation would match the empty string at every
        # position and the lookup in translate() would raise KeyError('').
        return text

    # Regex alternation picks the first alternative that matches, so try
    # longer keys first to keep 'a' from shadowing 'ab'.
    keys = sorted(wordDic, key=len, reverse=True)
    rc = re.compile('|'.join(map(re.escape, keys)))

    def translate(match):
        return wordDic[match.group(0)]

    return rc.sub(translate, text)
|
||||
|
||||
|
||||
def truncate_words(content, length=10, suffix='...'):
    """Truncates a string after a certain number of words.

    *content* is split on single spaces. If it holds at most *length* words
    it is returned unchanged; otherwise the first *length* words are joined
    back together with spaces and *suffix* is appended.

    >>> truncate_words('a b c', 2)
    'a b...'
    >>> truncate_words('a b c', 3)
    'a b c'
    """
    words = content.split(" ")
    if len(words) <= length:
        # Nothing to cut: return the input untouched (no suffix).
        return content
    # max() guards against a negative length slicing from the wrong end.
    return " ".join(words[:max(length, 0)]) + suffix
|
||||
|
||||
|
||||
# from <http://stackoverflow.com/questions/250357/smart-truncate-in-python>
|
||||
def truncate_str(content, length=100, suffix='...'):
    """Truncates a string after a certain number of chars.

    Content no longer than *length* is returned unchanged. Otherwise the
    text is cut back to the last space at or before position *length*, so
    no word is split, and *suffix* is appended. If there is no space to cut
    at, the text is cut hard at *length* characters. The suffix is not
    counted towards *length*.

    >>> truncate_str('hello world foo', 11)
    'hello world...'
    >>> truncate_str('hello world', 8)
    'hello...'
    """
    if len(content) <= length:
        return content

    # Look one character past the limit: if that character is a space, the
    # word ending at the limit is complete and should be kept.
    head = content[:length + 1]
    if ' ' in head:
        head = head.rsplit(' ', 1)[0]
    else:
        head = head[:length]
    return head + suffix
|
||||
|
||||
|
||||
# ALL CODE BELOW THIS LINE IS COVERED BY THE FOLLOWING AGREEMENT:
|
||||
|
||||
# Copyright (c) Django Software Foundation and individual contributors.
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of Django nor the names of its contributors may be used
|
||||
# to endorse or promote products derived from this software without
|
||||
# specific prior written permission.
|
||||
#
|
||||
#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
#ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
#WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
#DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
#ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
#(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
#LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||||
#ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
#SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
# Expression to match some_token and some_token="with spaces" (and similarly
|
||||
# for single-quoted strings).
|
||||
|
||||
# Compiled pattern used by smart_split(). A token is either a run containing
# one or more double- or single-quoted sections (backslash escapes allowed
# inside the quotes, with optional non-space, non-quote text around them) or,
# failing that, any run of non-whitespace characters.
# The pattern is written as two adjacent raw-string literals joined by a line
# continuation; the single-quote character class [^'\\] straddles the join.
# With re.VERBOSE the literal spaces around the "|" alternations are ignored.
split_re = re.compile(r"""((?:[^\s'"]*(?:(?:"(?:[^"\\]|\\.)*" | '(?:[""" \
                      r"""^'\\]|\\.)*')[^\s'"]*)+) | \S+)""", re.VERBOSE)
|
||||
|
||||
|
||||
def smart_split(text):
    r"""
    Lazily yield the space-separated pieces of *text*, keeping quoted
    phrases in one piece.

    Both single and double quotes are recognised, and a backslash escapes a
    quote inside a quoted phrase. Each piece is yielded exactly as it
    appears in the input: surrounding quote marks are kept and escaped
    quotes stay escaped, so callers may unescape them afterwards.

    >>> list(smart_split(r'This is "a person\'s" test.'))
    [u'This', u'is', u'"a person\\\'s"', u'test.']
    >>> list(smart_split(r"Another 'person\'s' test."))
    [u'Another', u"'person\\'s'", u'test.']
    >>> list(smart_split(r'A "\"funky\" style" test.'))
    [u'A', u'"\\"funky\\" style"', u'test.']
    """
    for token_match in split_re.finditer(text):
        token = token_match.group(0)
        yield token
|
||||
|
||||
|
||||
def get_text_list(list_, last_word='or'):
    """
    Join the items of *list_* into a human-readable enumeration: items are
    separated by ", " except the final two, which are separated by
    *last_word*. Items are formatted with ``%s``, so non-string items are
    accepted. An empty list gives the empty string.

    >>> get_text_list(['a', 'b', 'c', 'd'])
    'a, b, c or d'
    >>> get_text_list(['a', 'b', 'c'], 'and')
    'a, b and c'
    >>> get_text_list(['a', 'b'], 'and')
    'a and b'
    >>> get_text_list(['a'])
    'a'
    >>> get_text_list([1, 2, 3])
    '1, 2 or 3'
    >>> get_text_list([])
    ''
    """
    if not list_:
        return ''
    # Format each item up front so join() never sees a non-string; the
    # one-element tuple keeps tuple items from being unpacked by '%'.
    items = ['%s' % (item,) for item in list_]
    if len(items) == 1:
        return items[0]
    return '%s %s %s' % (', '.join(items[:-1]), last_word, items[-1])
|
Reference in a new issue