58 lines
1.6 KiB
Python
58 lines
1.6 KiB
Python
|
from bs4 import BeautifulSoup
|
||
|
import bs4
|
||
|
|
||
|
import logging
|
||
|
import re
|
||
|
|
||
|
class HtmlParser:
    """Extract participant results from the HTML of a result page.

    Every ``<table>`` inside the ``<div class="extract">`` element is read.
    The first table must be the final round (its first cell has to read
    'Endrunde', German for "final round"); participants found in it are
    flagged as finalists, participants of all following tables are not.
    """

    # A participant cell has the form "<name> (<digits>)". Compiled once for
    # the class instead of once per table row.
    _PARTICIPANT_PATTERN = re.compile(r'(.*) \(([0-9]+)\)')

    # Number of leading rows skipped in every table before participant rows
    # are read (presumably heading rows -- confirm against a sample export).
    _SKIPPED_ROWS = 2

    def __init__(self):
        self.l = logging.getLogger('solo_turnier.html_parser')

    def parseString(self, text: str) -> dict:
        """Parse the HTML in *text* and return the participants found.

        Args:
            text: The HTML document as a string.

        Returns:
            A dict keyed by the digits found in parentheses after the
            participant name (kept as a string). Each value is a dict with
            the keys ``'name'``, ``'place'`` and ``'finalist'``. If the same
            number shows up in several rows, the last row wins. The dict is
            empty when the extract div contains no table.

        Raises:
            Exception: If the ``<div class="extract">`` element is missing,
                if the first table is not the 'Endrunde' table, or if a
                participant cell does not match ``"<name> (<digits>)"``.
        """
        soup = BeautifulSoup(text, 'html.parser')

        container = soup.find('div', class_='extract')
        if container is None:
            # find() yields None when nothing matches; report this like any
            # other unparsable document instead of failing with an
            # AttributeError on None.
            raise Exception('Could not parse HTML file')

        tables = container.find_all('table')
        ret = {}
        if not tables:
            return ret

        # Validate before reading any rows, so a wrong document is rejected
        # up front.
        self._checkFinalTable(tables[0])

        for index, table in enumerate(tables):
            rows = table.find_all('tr')[self._SKIPPED_ROWS:]
            # Only the first table holds the finalists.
            self._parseRows(rows, index == 0, ret)

        return ret

    @staticmethod
    def _checkFinalTable(table):
        """Raise unless the first cell of *table* reads 'Endrunde'."""
        roundName = table.tr.td.contents[0]
        if roundName != 'Endrunde':
            raise Exception('Could not parse HTML file')

    def _parseRows(self, rows, finalist: bool, ret: dict):
        """Add one participant entry to *ret* for every row in *rows*.

        The first cell of a row provides the place, the second cell the
        participant in the form ``"<name> (<digits>)"``.
        """
        for row in rows:
            tds = row.find_all('td')
            place = tds[0].contents[0]

            match = self._PARTICIPANT_PATTERN.fullmatch(tds[1].contents[0])
            if match is None:
                raise Exception(f'Could not match {tds} to regex search pattern')

            ret[match.group(2)] = {
                'name': match.group(1),
                'place': place,
                'finalist': finalist
            }
|