Create tested HTML parser for result tables

This commit is contained in:
Christian Wolf 2022-11-13 17:01:44 +01:00
parent 4493fa09ea
commit 0d978221f1
10 changed files with 318 additions and 2 deletions

View File

@ -1,11 +1,17 @@
{
"folders": [
{
"path": "."
"path": ".",
"name": "code"
},
{
"path": "../../../../../nextcloud/Documents/Projekte/SLT/Auswertungsskript Solo"
}
],
"settings": {}
"settings": {
"python.testing.pytestEnabled": false,
"python.autoComplete.extraPaths": [
"${workspaceFolder:code}/venv/lib"
]
}
}

View File

@ -1 +1,3 @@
beautifulsoup4==4.11.1
soupsieve==2.3.2.post1
tabulate==0.9.0

1
src/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
/.coverage

View File

@ -0,0 +1,57 @@
from bs4 import BeautifulSoup
import bs4
import logging
import re
class HtmlParser:
    """Extract participant placings from an HTML result page.

    The expected layout is a ``<div class="extract">`` that contains one or
    more result tables (for example pages generated by TopTurnier).
    """

    # Matches a participant cell of the form "<name> (<digits>)".
    # Compiled once for the class instead of once per parsed row.
    _PARTICIPANT_PATTERN = re.compile(r'(.*) \(([0-9]+)\)')

    def __init__(self):
        self.l = logging.getLogger('solo_turnier.html_parser')

    def parseString(self, text: str):
        """Parse the HTML in ``text`` and return the participants found.

        All tables inside the first ``<div class="extract">`` are read. The
        first table must carry ``Endrunde`` in the first cell of its first
        row; its participants are flagged as finalists. Participants of all
        following tables are flagged as non-finalists. In every table the
        first two rows are skipped (they hold headings, not participants).

        Args:
            text: The complete HTML document as a string.

        Returns:
            A dict keyed by the number given in parentheses after the
            participant's name (as a string; presumably the start number).
            Each value is a dict with the keys ``'name'``, ``'place'`` and
            ``'finalist'``. A later row with the same number overwrites an
            earlier one.

        Raises:
            Exception: If the ``extract`` container is missing, if the first
                table is not the ``Endrunde`` table, or if a participant cell
                does not match the ``name (number)`` pattern.
        """
        soup = BeautifulSoup(text, 'html.parser')
        ret = {}

        def parseRows(rows, finalist: bool):
            # Store one entry in ``ret`` per table row.
            for row in rows:
                tds = row.find_all('td')

                # Convert to a plain ``str``: a NavigableString keeps a
                # reference to the whole parse tree alive.
                place = str(tds[0].contents[0])

                match = self._PARTICIPANT_PATTERN.fullmatch(tds[1].contents[0])
                if match is None:
                    raise Exception(f'Could not match {tds} to regex search pattern')

                ret[match.group(2)] = {
                    'name': match.group(1),
                    'place': place,
                    'finalist': finalist,
                }

        extract = soup.find('div', class_='extract')
        if extract is None:
            # ``find`` returns None when nothing matches; fail with the same
            # parse error used below instead of an opaque AttributeError.
            raise Exception('Could not parse HTML file')

        tables = extract.find_all('table')
        if not tables:
            return ret

        finalTable, *otherTables = tables

        roundName = finalTable.tr.td.contents[0]
        if roundName != 'Endrunde':
            raise Exception('Could not parse HTML file')
        parseRows(finalTable.find_all('tr')[2:], True)

        for table in otherTables:
            parseRows(table.find_all('tr')[2:], False)

        return ret

View File

View File

@ -0,0 +1,85 @@
<!DOCTYPE html PUBLIC "-//WAPFORUM//DTD XHTML Mobile 1.2//EN" "http://www.openmobilealliance.org/tech/DTD/xhtml-mobile12.dtd">
<HTML>
<HEAD>
<META http-equiv="Content-Type" content="text/html; charset=utf-8">
<META name="Author" content="Saarländischer Landesverband für Tanzsport">
<META name="GENERATOR" content="TopTurnierDigital">
<meta http-equiv="Cache-Control" content="no-cache, no-store, must-revalidate" />
<meta http-equiv="Pragma" content="no-cache" />
<meta http-equiv="Expires" content="0" />
<TITLE>09.07.2022 Jun. Newc./Beg. Rumba</TITLE>
<link rel="stylesheet" type="text/css" href="topturnier.css">
<meta name="viewport" content="width=device-width, initial-scale=1" />
</HEAD>
<body><div class="mainback">
<div class="eventhead">
<table border=0 width=100%>
<tr><td>09.07.2022 - ETW, Solos Jun. Newc./Beg. Rumba</td><td width=30>
<a class="backbtn" href="index.htm" target="_top">&equiv;</a>
</td></tr>
</table>
</div>
<div class="maincontainer">
<div class="comphead">Ergebnis</div>
<hr class="line">
<div class="extract">
<TABLE class="tab1">
<TR>
<TD class="td1" colspan="2">Endrunde</TD>
</TR>
<TR>
<TD class="td2c" width="45px">Platz</TD>
<TD class="td2">Teilnehmer</TD>
</TR>
<TR>
<TD class="td3r">1.</TD>
<TD class="td5">Max Mustermann 1 (14)</TD>
</TR>
<TR>
<TD class="td3r">2.</TD>
<TD class="td5">Max Mustermann 2 (9)</TD>
</TR>
<TR>
<TD class="td3r">3.</TD>
<TD class="td5">Max Mustermann 3 (13)</TD>
</TR>
<TR>
<TD class="td3r">4.</TD>
<TD class="td5">Max Mustermann 4 (17)</TD>
</TR>
<TR>
<TD class="td3r">5.</TD>
<TD class="td5">Max Mustermann 5 (6)</TD>
</TR>
<TR>
<TD class="td3r">6.</TD>
<TD class="td5">Max Mustermann 6 (27)</TD>
</TR>
<TR>
<TD class="td3r">7.</TD>
<TD class="td5">Max Mustermann 7 (22)</TD>
</TR>
</TABLE>
<TABLE class="tab2" border=0>
<TR style="height: 5px; border:0;">
<TD class="td5" colspan=2 style="height: 5px; border:0;"></TD>
</TR>
<TR>
<TD class="td1" colspan=2>Vorrunde</TD>
</TR>
<TR>
<TD class="td3r" width="45px">8.</TD>
<TD class="td5">Max Mustermann 8 (26)</TD>
</TR>
<TR>
<TD class="td3r" width="45px">9.</TD>
<TD class="td5">Max Mustermann 9 (25)</TD>
</TR>
</TABLE>
<br>
</div>
</div>
</div>
<P><FONT size="1" face="Arial">Diese Liste wurde mit <A TARGET="_blank" HREF="http://www.TopTurnier.de">TopTurnier f&uuml;r Windows V9.3</A> erstellt.<br></FONT>
</body>
</HTML>

View File

@ -0,0 +1,47 @@
{
"14": {
"name": "Max Mustermann 1",
"place": "1.",
"finalist": true
},
"9": {
"name": "Max Mustermann 2",
"place": "2.",
"finalist": true
},
"13": {
"name": "Max Mustermann 3",
"place": "3.",
"finalist": true
},
"17": {
"name": "Max Mustermann 4",
"place": "4.",
"finalist": true
},
"6": {
"name": "Max Mustermann 5",
"place": "5.",
"finalist": true
},
"27": {
"name": "Max Mustermann 6",
"place": "6.",
"finalist": true
},
"22": {
"name": "Max Mustermann 7",
"place": "7.",
"finalist": true
},
"26": {
"name": "Max Mustermann 8",
"place": "8.",
"finalist": false
},
"25": {
"name": "Max Mustermann 9",
"place": "9.",
"finalist": false
}
}

View File

@ -0,0 +1,63 @@
<!DOCTYPE html PUBLIC "-//WAPFORUM//DTD XHTML Mobile 1.2//EN" "http://www.openmobilealliance.org/tech/DTD/xhtml-mobile12.dtd">
<HTML>
<HEAD>
<META http-equiv="Content-Type" content="text/html; charset=utf-8">
<META name="Author" content="Saarländischer Landesverband für Tanzsport">
<META name="GENERATOR" content="TopTurnierDigital">
<meta http-equiv="Cache-Control" content="no-cache, no-store, must-revalidate" />
<meta http-equiv="Pragma" content="no-cache" />
<meta http-equiv="Expires" content="0" />
<TITLE>09.07.2022 Jun. Beginner Jive</TITLE>
<link rel="stylesheet" type="text/css" href="topturnier.css">
<meta name="viewport" content="width=device-width, initial-scale=1" />
</HEAD>
<body><div class="mainback">
<div class="eventhead">
<table border=0 width=100%>
<tr><td>09.07.2022 - ETW, Solos Jun. Beginner Jive</td><td width=30>
<a class="backbtn" href="index.htm" target="_top">&equiv;</a>
</td></tr>
</table>
</div>
<div class="maincontainer">
<div class="comphead">Ergebnis</div>
<hr class="line">
<div class="extract">
<TABLE class="tab1">
<TR>
<TD class="td1" colspan="2">Endrunde</TD>
</TR>
<TR>
<TD class="td2c" width="45px">Platz</TD>
<TD class="td2">Teilnehmer</TD>
</TR>
<TR>
<TD class="td3r">1.</TD>
<TD class="td5">Maxime Musterfrau 1 (14)</TD>
</TR>
<TR>
<TD class="td3r">2.</TD>
<TD class="td5">Maxime Musterfrau 2 (13)</TD>
</TR>
<TR>
<TD class="td3r">3.</TD>
<TD class="td5">Maxime Musterfrau 3 (17)</TD>
</TR>
<TR>
<TD class="td3r">4.</TD>
<TD class="td5">Maxime Musterfrau 4 (6)</TD>
</TR>
<TR>
<TD class="td3r">5.</TD>
<TD class="td5">Maxime Musterfrau 5 (22)</TD>
</TR>
</TABLE>
<TABLE class="tab2" border=0>
</TABLE>
<br>
</div>
</div>
</div>
<P><FONT size="1" face="Arial">Diese Liste wurde mit <A TARGET="_blank" HREF="http://www.TopTurnier.de">TopTurnier f&uuml;r Windows V9.3</A> erstellt.<br></FONT>
</body>
</HTML>

View File

@ -0,0 +1,27 @@
{
"14": {
"name": "Maxime Musterfrau 1",
"place": "1.",
"finalist": true
},
"13": {
"name": "Maxime Musterfrau 2",
"place": "2.",
"finalist": true
},
"17": {
"name": "Maxime Musterfrau 3",
"place": "3.",
"finalist": true
},
"6": {
"name": "Maxime Musterfrau 4",
"place": "4.",
"finalist": true
},
"22": {
"name": "Maxime Musterfrau 5",
"place": "5.",
"finalist": true
}
}

View File

@ -0,0 +1,28 @@
import pytest
import os
import json
import solo_turnier.html_parser
@pytest.fixture(scope='module', params=["1", '2'])
def dataProviderHtmlParser(request):
    """Provide the input HTML and the expected result for one test variant.

    Each parameter names a sub-directory of ``html_parser`` (located next to
    this test module) that holds ``erg.htm`` and ``expected.json``.

    Returns:
        A tuple ``(html, jsonContent)``: the content of ``erg.htm`` as a
        string and the decoded content of ``expected.json``.
    """
    variant = request.param
    variantDir = os.path.join(os.path.dirname(__file__), 'html_parser', variant)
    htmlFile = os.path.join(variantDir, 'erg.htm')
    jsonFile = os.path.join(variantDir, 'expected.json')

    # Read explicitly as UTF-8: the HTML fixtures declare charset=utf-8 and
    # contain non-ASCII characters, so the platform default must not be used.
    with open(htmlFile, 'r', encoding='utf-8') as fp:
        html = fp.read()

    with open(jsonFile, 'r', encoding='utf-8') as fp:
        jsonContent = json.load(fp)

    return (html, jsonContent)
def test_extractDataFromHtml(dataProviderHtmlParser):
    """Check that parsing the fixture HTML yields the expected dictionary."""
    htmlString, expected = dataProviderHtmlParser

    actualResult = solo_turnier.html_parser.HtmlParser().parseString(htmlString)

    assert actualResult == expected