Run page data through BS4 (BeautifulSoup) to fix text formatting and related issues (will optimise later)

This commit is contained in:
Luke Rogers 2013-07-16 03:04:07 +12:00
parent 85ec22a005
commit a3669e6b50

View file

@ -12,15 +12,16 @@ def api_get(kind, query):
def get_info(url):
try:
page = http.get(url)
soup = http.get_soup(url)
except Exception as e:
return "Could not get SCP information: Unable to fetch URL. ({})".format(e)
contents = re.sub('<[^<]+?>', '', page)
safe_html = unicode(soup)
contents = re.sub('<[^<]+?>', '', safe_html)
try:
item_id = http.unescape(re.findall("Item #: (.+?)\n", contents, re.S)[0])
object_class = http.unescape(re.findall("Object Class: (.+?)\n", contents, re.S)[0])
description = http.unescape(re.findall("Description: (.+?)\n", contents, re.S)[0])
item_id = re.findall("Item #: (.+?)\n", contents, re.S)[0]
object_class = re.findall("Object Class: (.+?)\n", contents, re.S)[0]
description = re.findall("Description: (.+?)\n", contents, re.S)[0]
except IndexError as e:
return "Could not get SCP information: Page was not a valid SCP page."