# solo-auswertung/src/solo_turnier/html_parser.py
import logging
import re

from bs4 import BeautifulSoup

from .types import HtmlPreviewParticipant, HtmlParticipant
from .types import HtmlPreviewImport as HtmlImport, HtmlResultImport
from .group import GroupParser
from .competition_class import CompetitionClassParser


class HtmlParser:
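    """Parser for the exported tournament HTML pages.

    Wraps a BeautifulSoup document and provides helpers to read the event
    title, guess group/class/dance from it, and extract the result and
    preparation-round tables.
    """
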
    def __init__(self, text: str, fileName: str = None):
        self.l = logging.getLogger('solo_turnier.html_parser')
        self.soup = BeautifulSoup(text, 'html.parser')
        self.fileName = fileName
        self.groupParser = GroupParser()
        self.classParser = CompetitionClassParser()

    def __repr__(self):
        if self.fileName is None:
            return 'HtmlParser(direct text)'
        else:
            return f'HtmlParser({self.fileName})'

    def getEventTitle(self):
        return self.soup.find('div', class_='eventhead').table.tr.td.contents[0]

    def guessDataFromHtmlTitle(self, title=None):
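        """Derive dance, class and group from the event title.

        The title is expected (per the regex below) to end in
        'ETW, Solos <group> <class> <dance>'; the group and class tokens are
        interpreted by GroupParser and CompetitionClassParser respectively.
        """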
        if title is None:
            title = self.getEventTitle()

        match = re.compile('.*?ETW, Solos (.*)').match(title)
        if match is None:
            raise Exception(f'Cannot parse title "{title}"')

        rest = match.group(1)
        rawGroup, rawClass, dance = rest.split(' ', 2)

        return {
            'dance': dance.strip(),
            'class_': str(self.classParser.parseClass(rawClass, True)),
            'group': str(self.groupParser.parseClass(rawGroup))
        }

    def parseResult(self):
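        """Collect the placed participants of a result page.

        Returns an HtmlResultImport built from a dict mapping each
        HtmlParticipant to the place shown in the table; participants from
        the first table ('Endrunde') are marked as finalists.
        """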
        participants = {}

        def __parseRows(rows, finalist: bool):
            def __parseRow(row):
                tds = row.find_all('td')
                # Ignore rows that do not consist of exactly two cells.
                if len(tds) != 2:
                    return

                regex = re.compile(r'(.*) \(([0-9]+)\)')
                place = tds[0].contents[0]
                match = regex.fullmatch(tds[1].contents[0])
                if match is None:
                    raise Exception(f'Could not match {tds} to regex search pattern')

                name = match.group(1)
                number = match.group(2)

                participant = HtmlParticipant(name, number)
                participant.finalist = finalist
                participants[participant] = place

            for row in rows:
                __parseRow(row)

        def __parseFirstTable(table):
            roundName = table.tr.td.contents[0]
            if roundName != 'Endrunde':
                raise Exception('Could not parse HTML file')

            # The first two rows of the final-round table hold no participant data.
            __parseRows(table.find_all('tr')[2:], True)

        def __parseRemainingTables(tables):
            for table in tables:
                __parseRows(table.find_all('tr'), False)

        tables = self.soup.find('div', class_='extract').find_all('table')
        if len(tables) > 0:
            __parseFirstTable(tables[0])
        __parseRemainingTables(tables[1:])

        # title = self.soup.find('div', class_='eventhead').table.tr.td.contents[0]
        # ret = HtmlImport(title, participants)
        ret = HtmlResultImport(participants)
        return ret

    def parsePreparationRound(self):
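        """Extract the raw preparation-round data.

        Returns a dict with the page title and, under 'data', the row titles
        of the first column plus a table of {'text', 'meta'} cell entries
        merged row-wise from all 'tab1' tables on the page.
        """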
        title = self.soup.find('div', class_='eventhead').table.tr.td.contents[0]

        tableData = []
        rowTitles = []

        def __mapBr(td):
            # Replace <br> tags with newlines and merge adjacent text nodes.
            for br in td.find_all('br'):
                br.replace_with('\n')
            td.smooth()
            return td

        def __extractTitles(table):
            for row in table.find_all('tr')[1:]:
                rowTitles.append(__mapBr(row.td).string)

        def __extractColumns(table):
            content = []

            def __extractContent(td):
                for br in td.find_all('br'):
                    br.replace_with('\n')

                # A nested <span>, if present, holds the meta part of the cell.
                span = td.span
                if span is not None:
                    span = span.extract()
                    meta = span.string
                else:
                    meta = None

                td.smooth()

                return {
                    'text': td.string.replace('\xa0', ' ').strip(),
                    'meta': meta
                }

            def __extractRow(row):
                entries = []
                for entry in row.find_all('td')[1:]:
                    entries.append(__extractContent(entry))
                return entries

            for row in table.find_all('tr')[1:]:
                content.append(__extractRow(row))

            return content

        def __mergeColumns(columns1, columns2):
            # Concatenate the cell entries of two tables row by row.
            return list(map(lambda x, y: x + y, columns1, columns2))

        extract = self.soup.find('div', class_='extract')
        tables = extract.find_all('table', class_='tab1')

        __extractTitles(tables[0])
        tableData = __extractColumns(tables[0])
        for table in tables[1:]:
            tableData = __mergeColumns(tableData, __extractColumns(table))

        data = {
            'titles': rowTitles,
            'table': tableData
        }

        return {'title': title, 'data': data}

    def cleanPreparationRoundImport(self, data):
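        """Normalize whitespace (including non-breaking spaces) in a
        parsePreparationRound() import, in place."""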
        def __cleanTable(table):
            def __cleanText(s: str):
                # print("cleaning string ", s)
                return s.strip(' \n\xa0')

            def __cleanEntry(entry):
                entry['text'] = __cleanText(entry['text'])
                if entry['meta'] is not None:
                    entry['meta'] = __cleanText(entry['meta'])

            for row in table:
                for entry in row:
                    # print(entry)
                    __cleanEntry(entry)

        data['title'] = data['title'].strip()
        __cleanTable(data['data']['table'])
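

# Illustrative usage sketch, not part of the original module. The file name
# 'ergebnis.htm' and the encoding are placeholders; because of the relative
# imports above, this only runs in package context, e.g. via
# `python -m solo_turnier.html_parser`.
if __name__ == '__main__':
    with open('ergebnis.htm', encoding='utf-8') as fd:
        parser = HtmlParser(fd.read(), 'ergebnis.htm')

    print(parser)
    print(parser.getEventTitle())
    print(parser.guessDataFromHtmlTitle())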