solo-auswertung/src/solo_turnier/html_parser.py

271 lines
8.9 KiB
Python

from bs4 import BeautifulSoup
import logging
import re
from .types import HtmlPreviewParticipant, HtmlParticipant, HtmlResultTotalTable
from .types import HtmlPreviewImport as HtmlImport, HtmlResultImport
from .group import GroupParser
from .competition_class import CompetitionClassParser
class IncompleteRoundException(Exception):
    """Raised while parsing a result page whose first table is not the final round.

    ``HtmlParser.parseResult`` raises this when the first table's round name
    is not "Endrunde" and catches it again itself.
    """

    def __init__(self, *args):
        # Pure pass-through; all positional arguments end up in ``self.args``.
        super().__init__(*args)
class HtmlParser:
    """Extract titles, result lists and result tables from one HTML page.

    The page text is parsed once with BeautifulSoup's ``html.parser`` backend
    in the constructor; every ``parse*`` method then reads from ``self.soup``.
    """

    # Title layout: '<anything>ETW, Solos <group> <class> <dance>', optionally
    # followed by a space and a double-quoted suffix.
    _TITLE_PATTERN = re.compile(r'.*?ETW, Solos (.*?)(?: ".*")?')
    # Result list entry: '<name> (<number>)'.
    _RESULT_ENTRY_PATTERN = re.compile(r"(.*) \(([0-9]+)\)")
    # Numeric log level used for very verbose tracing (below logging.DEBUG).
    _TRACE_LEVEL = 5

    def __init__(self, text: str, fileName: str = None):
        """Parse *text* as HTML.

        Args:
            text: The HTML source to parse.
            fileName: Optional name of the file the text came from; only used
                by ``__repr__``.
        """
        self.l = logging.getLogger("solo_turnier.html_parser")
        self.soup = BeautifulSoup(text, "html.parser")
        self.fileName = fileName
        self.groupParser = GroupParser()
        self.classParser = CompetitionClassParser()

    def __repr__(self) -> str:
        """Return a short description naming the source file, if known."""
        if self.fileName is None:
            return "HtmlParser(direct text)"
        return f"HtmlParser({self.fileName})"

    def getEventTitle(self):
        """Return the first child of the first cell in the ``eventhead`` div's table."""
        return self.soup.find("div", class_="eventhead").table.tr.td.contents[0]

    def guessDataFromHtmlTitle(self, title=None) -> dict:
        """Derive group, class and dance from an event title.

        Args:
            title: The title to analyse. When ``None`` the title is read from
                the parsed document via ``getEventTitle``.

        Returns:
            A dict with the keys ``"dance"``, ``"class_"`` and ``"group"``;
            class and group are the ``str()`` of the respective parser result.

        Raises:
            Exception: If the title does not match the expected layout.
            ValueError: If the matched part holds fewer than three
                space-separated tokens (group, class, dance).
        """
        if title is None:
            title = self.getEventTitle()

        match = self._TITLE_PATTERN.fullmatch(title)
        if match is None:
            self.l.error('Cannot parse html title "%s". Possible bug?', title)
            raise Exception(f'Cannot parse title "{title}"')

        # At most three parts: the dance name itself may contain spaces.
        parts = match.group(1).split(" ", 2)
        if len(parts) != 3:
            # Previously surfaced as an unlogged "not enough values to unpack".
            self.l.error(
                'Cannot split html title "%s" into group, class and dance.', title
            )
            raise ValueError(
                f'Cannot split title "{title}" into group, class and dance'
            )
        rawGroup, rawClass, dance = parts

        return {
            "dance": dance.strip(),
            "class_": str(self.classParser.parseClass(rawClass, True)),
            "group": str(self.groupParser.parseClass(rawGroup)),
        }

    def parseResult(self):
        """Parse the result lists in the ``extract`` div.

        The first table must be titled "Endrunde"; its entries (after the
        first two rows) are marked as finalists. Entries of all further tables
        are marked as non-finalists. If the first table has another round
        name, a warning is logged and no participants are collected.

        Returns:
            ``HtmlResultImport`` wrapping a dict that maps each
            ``HtmlParticipant`` to the content of its place cell.

        Raises:
            Exception: If a two-cell row does not look like ``name (number)``.
        """
        participants = {}

        def parseRow(row, finalist: bool):
            cells = row.find_all("td")
            if len(cells) != 2:
                return
            placeCell, starterCell = cells

            if starterCell.contents[0].startswith("Alle Starter weiter genommen."):
                self.l.info("No excluded starters found.")
                return

            place = placeCell.contents[0]
            match = self._RESULT_ENTRY_PATTERN.fullmatch(starterCell.contents[0])
            if match is None:
                self.l.error("Could not match %s to regex search pattern", str(cells))
                raise Exception(f"Could not match {cells} to regex search pattern")

            participant = HtmlParticipant(match.group(1), match.group(2))
            participant.finalist = finalist
            participants[participant] = place

        def parseRows(rows, finalist: bool):
            for row in rows:
                parseRow(row, finalist)

        def parseFirstTable(table):
            roundName = table.tr.td.contents[0]
            if roundName != "Endrunde":
                self.l.warning("Found table with round name %s.", roundName)
                raise IncompleteRoundException("Could not parse HTML file")
            # The first two rows of the final-round table carry no entries.
            parseRows(table.find_all("tr")[2:], True)

        tables = self.soup.find("div", class_="extract").find_all("table")
        try:
            if tables:
                parseFirstTable(tables[0])
                for table in tables[1:]:
                    parseRows(table.find_all("tr"), False)
        except IncompleteRoundException:
            # Deliberate: the warning was already logged; an incomplete round
            # simply yields an empty result.
            pass

        return HtmlResultImport(participants)

    def parsePreparationRound(self):
        """Parse the ``tab1`` tables of the ``extract`` div.

        Row titles come from the first cell of every row but the first in the
        first table. The remaining cells of those rows form the table data;
        further tables are appended column-wise, row by row.

        Returns:
            ``{"title": <event title>, "data": {"titles": [...], "table": [...]}}``
            where each table cell is ``{"text": str, "meta": <span text or None>}``.
        """
        title = self.getEventTitle()

        def replaceBreaks(td):
            # Turn <br> into newlines so the cell can collapse into one string.
            for br in td.find_all("br"):
                br.replace_with("\n")

        def extractRowTitle(td):
            replaceBreaks(td)
            td.smooth()
            return td.string

        def extractContent(td):
            replaceBreaks(td)
            span = td.span
            meta = None
            if span is not None:
                # Detach the span so only the plain cell text remains.
                meta = span.extract().string
            td.smooth()
            # NOTE(review): td.string is None for cells with no child or with
            # several children; such cells still raise AttributeError here.
            return {"text": td.string.replace("\xa0", " ").strip(), "meta": meta}

        def extractColumns(table):
            return [
                [extractContent(td) for td in row.find_all("td")[1:]]
                for row in table.find_all("tr")[1:]
            ]

        tables = self.soup.find("div", class_="extract").find_all(
            "table", class_="tab1"
        )
        firstTable = tables[0]
        rowTitles = [extractRowTitle(row.td) for row in firstTable.find_all("tr")[1:]]

        tableData = extractColumns(firstTable)
        for table in tables[1:]:
            # Rows are paired up; surplus rows on either side are dropped.
            tableData = [
                left + right
                for left, right in zip(tableData, extractColumns(table))
            ]

        return {"title": title, "data": {"titles": rowTitles, "table": tableData}}

    def cleanPreparationRoundImport(self, data) -> None:
        """Strip surrounding whitespace from a ``parsePreparationRound`` result.

        Modifies *data* in place: the title is stripped, and every cell's
        ``"text"`` and non-``None`` ``"meta"`` lose leading/trailing spaces,
        newlines and non-breaking spaces.
        """

        def cleanText(s: str) -> str:
            return s.strip(" \n\xa0")

        data["title"] = data["title"].strip()
        for row in data["data"]["table"]:
            for entry in row:
                entry["text"] = cleanText(entry["text"])
                if entry["meta"] is not None:
                    entry["meta"] = cleanText(entry["meta"])

    def parseIndividualResult(self, competitionGroup, competitionClass, dance):
        """Parse every table of the ``extract`` div into per-starter results.

        For each table the ids are read from the second row, the places from
        the row starting with "Platz von", and - if present - classes and
        groups from the rows starting with "Startklasse" and "Startgruppe".

        Args:
            competitionGroup: Stored as first element of each result key.
            competitionClass: Stored as second element of each result key.
            dance: Stored as third element of each result key.

        Returns:
            ``HtmlResultTotalTable`` wrapping a dict that maps
            ``(competitionGroup, competitionClass, dance, id)`` to
            ``(place, class or None, group or None)``.

        Raises:
            Exception: If a table has no row starting with "Platz von".
        """
        participants = {}

        def parseTable(table):
            rows = table.find_all("tr")

            # Ids live in the second row; the first cell is the row label and
            # blank cells are ignored.
            idTexts = (td.contents[0].strip() for td in rows[1]("td")[1:])
            ids = [text for text in idTexts if len(text) > 0]
            numIds = len(ids)
            self.l.log(self._TRACE_LEVEL, "Found ids in dataset: %s", ids)

            def findRowIndex(prefixStr):
                """Return the index of the first row whose label starts with prefixStr."""
                for index, row in enumerate(rows):
                    cell = row.td
                    if cell is None or not cell.contents:
                        # Rows without a (non-empty) first cell cannot match.
                        continue
                    label = cell.contents[0]
                    if isinstance(label, str) and label.startswith(prefixStr):
                        return index
                return None

            def dataCells(rowIndex):
                # Skip the label cell, then take one cell per id.
                return rows[rowIndex]("td")[1 : (numIds + 1)]

            def getSinglePlaceStr(tag):
                # A place range is split by <br>; join it with a dash.
                for br in tag("br"):
                    br.replace_with("-")
                tag.smooth()
                rawStr = tag.contents[0].strip()
                if rawStr.endswith("-"):
                    rawStr = rawStr[:-1]
                return rawStr

            def getOptionalRow(prefixStr):
                rowIndex = findRowIndex(prefixStr)
                if rowIndex is None:
                    return None
                return [td.contents[0] for td in dataCells(rowIndex)]

            placeRowIdx = findRowIndex("Platz von")
            if placeRowIdx is None:
                # Previously this fell through to rows[None] and a TypeError.
                self.l.error('Could not find a row starting with "Platz von".')
                raise Exception('Could not find a row starting with "Platz von"')
            places = [getSinglePlaceStr(tag) for tag in dataCells(placeRowIdx)]
            self.l.log(self._TRACE_LEVEL, "Found places: %s", places)

            classes = getOptionalRow("Startklasse")
            self.l.log(self._TRACE_LEVEL, "Classes: %s", classes)

            groups = getOptionalRow("Startgruppe")
            self.l.log(self._TRACE_LEVEL, "Groups: %s", groups)

            for idx, starterId in enumerate(ids):
                cls = classes[idx] if classes is not None else None
                grp = groups[idx] if groups is not None else None
                key = (competitionGroup, competitionClass, dance, starterId)
                participants[key] = (places[idx], cls, grp)

        for table in self.soup.find("div", class_="extract").find_all("table"):
            parseTable(table)

        return HtmlResultTotalTable(participants)