solo-auswertung/src/solo_turnier/html_parser.py

196 lines
5.9 KiB
Python
Raw Normal View History

from bs4 import BeautifulSoup
import logging
import re
2022-11-15 13:38:59 +00:00
class HtmlParticipant:
    """A single participant entry taken from a result table.

    Attributes are stored exactly as passed in; no conversion is applied.
    """

    def __init__(self, name, place, finalist):
        """Remember the participant's name, place and finalist flag."""
        self.name, self.place, self.finalist = name, place, finalist

    def __str__(self):
        """Return a short human-readable summary (name and place only)."""
        return f'{self.name} (with place {self.place})'
2022-11-15 13:38:59 +00:00
class HtmlImport:
    """Result of importing one HTML page: its title plus its participants."""

    def __init__(self, title: str, participants: dict[int, HtmlParticipant]):
        """Store the title and the participant mapping without copying them.

        NOTE(review): HtmlParser.parseString in this file keys the mapping
        by the regex-captured number, which is a string, although the
        annotation says int -- confirm which one callers expect.
        """
        self.participants = participants
        self.title = title
class HtmlParser:
2022-12-03 13:29:01 +00:00
def __init__(self, text: str, fileName: str = None):
    """Parse ``text`` as HTML and keep the resulting soup on the instance.

    text: HTML markup handed to BeautifulSoup with the 'html.parser' backend.
    fileName: optional label that is only used by ``__repr__``.
    """
    self.fileName = fileName
    self.l = logging.getLogger('solo_turnier.html_parser')
    self.soup = BeautifulSoup(text, 'html.parser')
def __repr__(self):
if self.fileName is None:
return 'HtmlParser(direct text)'
else:
return f'HtmlParser({self.fileName})'
2022-11-26 07:43:15 +00:00
def getEventTitle(self):
return self.soup.find('div', class_='eventhead').table.tr.td.contents[0]
def guessDataFromHtmlTitle(self, title = None):
    """Derive group, class and dance from an event title.

    The title must contain 'ETW, Solos ' followed by three parts:
    the group, the class and the dance (the dance may contain spaces).
    Known long names are mapped to their abbreviations; unknown names
    are passed through unchanged.

    title: the title to analyse; when None, ``getEventTitle()`` is used.

    Returns a dict with the keys 'dance', 'class_' and 'group'.

    Raises:
        Exception: the title does not contain the 'ETW, Solos ' marker.
        ValueError: fewer than three parts follow the marker.
    """
    if title is None:
        title = self.getEventTitle()

    match = re.compile('.*?ETW, Solos (.*)').match(title)
    if match is None:
        raise Exception(f'Cannot parse title "{title}"')

    # Split on runs of whitespace rather than on single spaces: a doubled
    # space between tokens would otherwise yield an empty class and shift
    # the real class into the dance name.
    parts = match.group(1).split(None, 2)
    if len(parts) != 3:
        # Same exception type the bare tuple unpacking used to raise, but
        # with a message that names the offending title.
        raise ValueError(
            f'Cannot parse title "{title}": '
            'expected group, class and dance after "Solos"'
        )
    rawGroup, rawClass, dance = parts

    classMap = {
        'Newcomer': 'Newc.',
        'Beginner': 'Beg.',
        'Advanced': 'Adv.'
    }

    groupMap = {
        'Kinder': 'Kin.',
        'Junioren': 'Jun.',
        'Jugend': 'Jug.',
    }

    return {
        'dance': dance.strip(),
        'class_': classMap.get(rawClass, rawClass),
        'group': groupMap.get(rawGroup, rawGroup)
    }
def parseString(self, text: str):
    """Parse a result page given as HTML text into an HtmlImport.

    Note that this parses ``text`` afresh and does not use ``self.soup``.

    Every table inside the 'extract' div is read, skipping its first two
    rows. The first table must be headed 'Endrunde'; its entries are
    flagged as finalists, entries of all later tables are not. The second
    cell of each row must read '<name> (<number>)'; the number (as a
    string) becomes the key of the participant mapping, so a number seen
    again in a later table overwrites the earlier entry.

    Raises Exception when the first table is not the 'Endrunde' table or a
    row's second cell does not match the expected pattern.
    """
    soup = BeautifulSoup(text, 'html.parser')
    entryPattern = re.compile('(.*) \\(([0-9]+)\\)')
    participants = {}

    def collect(table, isFinalist):
        # The first two rows of each table are skipped; the rest hold entries.
        for row in table.find_all('tr')[2:]:
            cells = row.find_all('td')
            place = cells[0].contents[0]
            found = entryPattern.fullmatch(cells[1].contents[0])
            if found is None:
                raise Exception(f'Could not match {cells} to regex search pattern')
            name, number = found.group(1, 2)
            participants[number] = HtmlParticipant(name, place, isFinalist)

    tables = soup.find('div', class_='extract').find_all('table')
    for position, table in enumerate(tables):
        leading = position == 0
        # Only the leading table is checked for the final-round heading.
        if leading and table.tr.td.contents[0] != 'Endrunde':
            raise Exception('Could not parse HTML file')
        collect(table, leading)

    title = soup.find('div', class_='eventhead').table.tr.td.contents[0]
    return HtmlImport(title, participants)
2022-11-15 13:19:10 +00:00
2022-11-26 07:43:15 +00:00
def parsePreparationRound(self):
    """Extract the raw table of a preparation-round page from ``self.soup``.

    All tables with class 'tab1' inside the 'extract' div are read. The
    first row of each table is skipped. Row titles come from the first
    cell of each remaining row of the first table; the other cells of every
    table become entries, and the rows of later tables are appended
    cell-wise to the rows of the first one.

    Each entry is a dict with
      'text': the cell text with non-breaking spaces turned into spaces,
              stripped, after removing the cell's first <span>;
      'meta': the ``.string`` of that removed <span>, or None without one.

    Returns {'title': <event title>, 'data': {'titles': [...], 'table': [...]}}.
    The soup is modified in place (line breaks replaced, spans removed).
    """
    title = self.soup.find('div', class_='eventhead').table.tr.td.contents[0]

    def brToNewline(cell):
        # Replace each <br> by a literal newline so it survives as text.
        for lineBreak in cell.find_all('br'):
            lineBreak.replace_with('\n')

    def readTitle(cell):
        brToNewline(cell)
        # Merge adjacent text nodes so that .string sees a single node.
        cell.smooth()
        return cell.string

    def readCell(cell):
        brToNewline(cell)
        marker = cell.span
        # The span is detached from the cell before the cell text is read.
        meta = None if marker is None else marker.extract().string
        cell.smooth()
        return {
            'text': cell.string.replace('\xa0', ' ').strip(),
            'meta': meta
        }

    def readRows(table):
        return [
            [readCell(cell) for cell in row.find_all('td')[1:]]
            for row in table.find_all('tr')[1:]
        ]

    tables = self.soup.find('div', class_='extract').find_all('table', class_='tab1')
    firstTable = tables[0]

    rowTitles = [readTitle(row.td) for row in firstTable.find_all('tr')[1:]]
    tableData = readRows(firstTable)

    for extraTable in tables[1:]:
        extraRows = readRows(extraTable)
        # Row-wise concatenation; surplus rows on either side are dropped.
        tableData = [left + right for left, right in zip(tableData, extraRows)]

    return {
        'title': title,
        'data': {
            'titles': rowTitles,
            'table': tableData
        }
    }
def cleanPreparationRoundImport(self, data):
    """Trim surrounding whitespace from an imported preparation round, in place.

    data: dict with a 'title' string and data['data']['table'], a list of
    rows whose entries are dicts with the keys 'text' and 'meta'.

    The title is stripped of all surrounding whitespace. Each entry's
    'text', and its 'meta' unless that is None, loses leading and trailing
    spaces, newlines and non-breaking spaces. Nothing is returned.
    """
    trimmed = ' \n\xa0'

    data['title'] = data['title'].strip()

    for row in data['data']['table']:
        for cell in row:
            cell['text'] = cell['text'].strip(trimmed)
            meta = cell['meta']
            if meta is not None:
                cell['meta'] = meta.strip(trimmed)