Run the fetched page data through BeautifulSoup (BS4) to fix the text formatting and HTML entity handling (will optimise later)
This commit is contained in:
parent
85ec22a005
commit
a3669e6b50
1 changed file with 6 additions and 5 deletions
|
@@ -12,15 +12,16 @@ def api_get(kind, query):
|
||||||
|
|
||||||
def get_info(url):
|
def get_info(url):
|
||||||
try:
|
try:
|
||||||
page = http.get(url)
|
soup = http.get_soup(url)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return "Could not get SCP information: Unable to fetch URL. ({})".format(e)
|
return "Could not get SCP information: Unable to fetch URL. ({})".format(e)
|
||||||
contents = re.sub('<[^<]+?>', '', page)
|
safe_html = unicode(soup)
|
||||||
|
contents = re.sub('<[^<]+?>', '', safe_html)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
item_id = http.unescape(re.findall("Item #: (.+?)\n", contents, re.S)[0])
|
item_id = re.findall("Item #: (.+?)\n", contents, re.S)[0]
|
||||||
object_class = http.unescape(re.findall("Object Class: (.+?)\n", contents, re.S)[0])
|
object_class = re.findall("Object Class: (.+?)\n", contents, re.S)[0]
|
||||||
description = http.unescape(re.findall("Description: (.+?)\n", contents, re.S)[0])
|
description = re.findall("Description: (.+?)\n", contents, re.S)[0]
|
||||||
except IndexError as e:
|
except IndexError as e:
|
||||||
return "Could not get SCP information: Page was not a valid SCP page."
|
return "Could not get SCP information: Page was not a valid SCP page."
|
||||||
|
|
||||||
|
|
Reference in a new issue