Create tested HTML parser for result tables
This commit is contained in:
parent
4493fa09ea
commit
0d978221f1
@ -1,11 +1,17 @@
|
||||
{
|
||||
"folders": [
|
||||
{
|
||||
"path": "."
|
||||
"path": ".",
|
||||
"name": "code"
|
||||
},
|
||||
{
|
||||
"path": "../../../../../nextcloud/Documents/Projekte/SLT/Auswertungsskript Solo"
|
||||
}
|
||||
],
|
||||
"settings": {}
|
||||
"settings": {
|
||||
"python.testing.pytestEnabled": false,
|
||||
"python.autoComplete.extraPaths": [
|
||||
"${workspaceFolder:code}/venv/lib"
|
||||
]
|
||||
}
|
||||
}
|
||||
|
@ -1 +1,3 @@
|
||||
beautifulsoup4==4.11.1
|
||||
soupsieve==2.3.2.post1
|
||||
tabulate==0.9.0
|
||||
|
1
src/.gitignore
vendored
Normal file
1
src/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
/.coverage
|
57
src/solo_turnier/html_parser.py
Normal file
57
src/solo_turnier/html_parser.py
Normal file
@ -0,0 +1,57 @@
|
||||
from bs4 import BeautifulSoup
|
||||
import bs4
|
||||
|
||||
import logging
|
||||
import re
|
||||
|
||||
class HtmlParser:
    """Extract participant placements from an HTML result page.

    The parser locates the ``<div class="extract">`` element and reads every
    table inside it. The first table must be headed ``Endrunde``; its
    participants are flagged as finalists. Participants from all further
    tables are flagged as non-finalists.
    """

    # A participant cell reads "<name> (<number>)", e.g. "Max Mustermann 1 (14)".
    # Group 1 is the name, group 2 the number. Compiled once for the class
    # instead of once per table row.
    _PARTICIPANT_PATTERN = re.compile(r'(.*) \(([0-9]+)\)')

    def __init__(self):
        # Kept under its original attribute name; nothing in this class logs yet.
        self.l = logging.getLogger('solo_turnier.html_parser')

    def parseString(self, text: str):
        """Parse the HTML in *text* and return the participants it lists.

        Args:
            text: The complete HTML document as a string.

        Returns:
            A dict keyed by the number found in parentheses after the
            participant name (as a string). Each value is a dict with the
            keys ``'name'``, ``'place'`` (first child of the place cell,
            e.g. ``'1.'``) and ``'finalist'`` (``True`` only for rows of the
            first table). If a number occurs more than once, the last row
            wins. The dict is empty when the extract div holds no tables.

        Raises:
            Exception: If the extract div is missing, the first table is not
                headed ``Endrunde``, a data row lacks two non-empty cells, or
                a participant cell does not match ``<name> (<number>)``.
        """
        soup = BeautifulSoup(text, 'html.parser')

        extract = soup.find('div', class_='extract')
        if extract is None:
            # find() yields None when nothing matches; report the parser's
            # own error instead of an AttributeError on None.
            raise Exception('Could not parse HTML file')

        ret = {}
        tables = extract.find_all('table')
        if tables:
            self._parseFirstTable(tables[0], ret)
            for table in tables[1:]:
                # As in the first table, the two leading rows are skipped.
                self._parseRows(table.find_all('tr')[2:], False, ret)

        return ret

    def _parseFirstTable(self, table, ret):
        """Check that *table* is the ``Endrunde`` table and add its rows as finalists."""
        headerRow = table.tr
        headerCell = headerRow.td if headerRow is not None else None
        if (
            headerCell is None
            or not headerCell.contents
            or headerCell.contents[0] != 'Endrunde'
        ):
            raise Exception('Could not parse HTML file')

        # The two leading rows (round title and column headings in the sample
        # pages) carry no participants.
        self._parseRows(table.find_all('tr')[2:], True, ret)

    def _parseRows(self, rows, finalist, ret):
        """Add one participant entry to *ret* for every row in *rows*."""
        for row in rows:
            tds = row.find_all('td')
            if len(tds) < 2 or not tds[0].contents or not tds[1].contents:
                raise Exception(
                    f'Expected a place cell and a participant cell in {row}'
                )

            place = tds[0].contents[0]

            match = self._PARTICIPANT_PATTERN.fullmatch(tds[1].contents[0])
            if match is None:
                raise Exception(f'Could not match {tds} to regex search pattern')

            ret[match.group(2)] = {
                'name': match.group(1),
                'place': place,
                'finalist': finalist,
            }
|
0
src/solo_turnier/tests/__init__.py
Normal file
0
src/solo_turnier/tests/__init__.py
Normal file
85
src/solo_turnier/tests/html_parser/1/erg.htm
Normal file
85
src/solo_turnier/tests/html_parser/1/erg.htm
Normal file
@ -0,0 +1,85 @@
|
||||
<!DOCTYPE html PUBLIC "-//WAPFORUM//DTD XHTML Mobile 1.2//EN" "http://www.openmobilealliance.org/tech/DTD/xhtml-mobile12.dtd">
|
||||
<HTML>
|
||||
<HEAD>
|
||||
<META http-equiv="Content-Type" content="text/html; charset=utf-8">
|
||||
<META name="Author" content="Saarländischer Landesverband für Tanzsport">
|
||||
<META name="GENERATOR" content="TopTurnierDigital">
|
||||
<meta http-equiv="Cache-Control" content="no-cache, no-store, must-revalidate" />
|
||||
<meta http-equiv="Pragma" content="no-cache" />
|
||||
<meta http-equiv="Expires" content="0" />
|
||||
<TITLE>09.07.2022 Jun. Newc./Beg. Rumba</TITLE>
|
||||
<link rel="stylesheet" type="text/css" href="topturnier.css">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
</HEAD>
|
||||
<body><div class="mainback">
|
||||
<div class="eventhead">
|
||||
<table border=0 width=100%>
|
||||
<tr><td>09.07.2022 - ETW, Solos Jun. Newc./Beg. Rumba</td><td width=30>
|
||||
<a class="backbtn" href="index.htm" target="_top">≡</a>
|
||||
</td></tr>
|
||||
</table>
|
||||
</div>
|
||||
<div class="maincontainer">
|
||||
<div class="comphead">Ergebnis</div>
|
||||
<hr class="line">
|
||||
<div class="extract">
|
||||
<TABLE class="tab1">
|
||||
<TR>
|
||||
<TD class="td1" colspan="2">Endrunde</TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD class="td2c" width="45px">Platz</TD>
|
||||
<TD class="td2">Teilnehmer</TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD class="td3r">1.</TD>
|
||||
<TD class="td5">Max Mustermann 1 (14)</TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD class="td3r">2.</TD>
|
||||
<TD class="td5">Max Mustermann 2 (9)</TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD class="td3r">3.</TD>
|
||||
<TD class="td5">Max Mustermann 3 (13)</TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD class="td3r">4.</TD>
|
||||
<TD class="td5">Max Mustermann 4 (17)</TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD class="td3r">5.</TD>
|
||||
<TD class="td5">Max Mustermann 5 (6)</TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD class="td3r">6.</TD>
|
||||
<TD class="td5">Max Mustermann 6 (27)</TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD class="td3r">7.</TD>
|
||||
<TD class="td5">Max Mustermann 7 (22)</TD>
|
||||
</TR>
|
||||
</TABLE>
|
||||
<TABLE class="tab2" border=0>
|
||||
<TR style="height: 5px; border:0;">
|
||||
<TD class="td5" colspan=2 style="height: 5px; border:0;"></TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD class="td1" colspan=2>Vorrunde</TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD class="td3r" width="45px">8.</TD>
|
||||
<TD class="td5">Max Mustermann 8 (26)</TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD class="td3r" width="45px">9.</TD>
|
||||
<TD class="td5">Max Mustermann 9 (25)</TD>
|
||||
</TR>
|
||||
</TABLE>
|
||||
<br>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<P><FONT size="1" face="Arial">Diese Liste wurde mit <A TARGET="_blank" HREF="http://www.TopTurnier.de">TopTurnier für Windows V9.3</A> erstellt.<br></FONT>
|
||||
</body>
|
||||
</HTML>
|
47
src/solo_turnier/tests/html_parser/1/expected.json
Normal file
47
src/solo_turnier/tests/html_parser/1/expected.json
Normal file
@ -0,0 +1,47 @@
|
||||
{
|
||||
"14": {
|
||||
"name": "Max Mustermann 1",
|
||||
"place": "1.",
|
||||
"finalist": true
|
||||
},
|
||||
"9": {
|
||||
"name": "Max Mustermann 2",
|
||||
"place": "2.",
|
||||
"finalist": true
|
||||
},
|
||||
"13": {
|
||||
"name": "Max Mustermann 3",
|
||||
"place": "3.",
|
||||
"finalist": true
|
||||
},
|
||||
"17": {
|
||||
"name": "Max Mustermann 4",
|
||||
"place": "4.",
|
||||
"finalist": true
|
||||
},
|
||||
"6": {
|
||||
"name": "Max Mustermann 5",
|
||||
"place": "5.",
|
||||
"finalist": true
|
||||
},
|
||||
"27": {
|
||||
"name": "Max Mustermann 6",
|
||||
"place": "6.",
|
||||
"finalist": true
|
||||
},
|
||||
"22": {
|
||||
"name": "Max Mustermann 7",
|
||||
"place": "7.",
|
||||
"finalist": true
|
||||
},
|
||||
"26": {
|
||||
"name": "Max Mustermann 8",
|
||||
"place": "8.",
|
||||
"finalist": false
|
||||
},
|
||||
"25": {
|
||||
"name": "Max Mustermann 9",
|
||||
"place": "9.",
|
||||
"finalist": false
|
||||
}
|
||||
}
|
63
src/solo_turnier/tests/html_parser/2/erg.htm
Normal file
63
src/solo_turnier/tests/html_parser/2/erg.htm
Normal file
@ -0,0 +1,63 @@
|
||||
<!DOCTYPE html PUBLIC "-//WAPFORUM//DTD XHTML Mobile 1.2//EN" "http://www.openmobilealliance.org/tech/DTD/xhtml-mobile12.dtd">
|
||||
<HTML>
|
||||
<HEAD>
|
||||
<META http-equiv="Content-Type" content="text/html; charset=utf-8">
|
||||
<META name="Author" content="Saarländischer Landesverband für Tanzsport">
|
||||
<META name="GENERATOR" content="TopTurnierDigital">
|
||||
<meta http-equiv="Cache-Control" content="no-cache, no-store, must-revalidate" />
|
||||
<meta http-equiv="Pragma" content="no-cache" />
|
||||
<meta http-equiv="Expires" content="0" />
|
||||
<TITLE>09.07.2022 Jun. Beginner Jive</TITLE>
|
||||
<link rel="stylesheet" type="text/css" href="topturnier.css">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
</HEAD>
|
||||
<body><div class="mainback">
|
||||
<div class="eventhead">
|
||||
<table border=0 width=100%>
|
||||
<tr><td>09.07.2022 - ETW, Solos Jun. Beginner Jive</td><td width=30>
|
||||
<a class="backbtn" href="index.htm" target="_top">≡</a>
|
||||
</td></tr>
|
||||
</table>
|
||||
</div>
|
||||
<div class="maincontainer">
|
||||
<div class="comphead">Ergebnis</div>
|
||||
<hr class="line">
|
||||
<div class="extract">
|
||||
<TABLE class="tab1">
|
||||
<TR>
|
||||
<TD class="td1" colspan="2">Endrunde</TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD class="td2c" width="45px">Platz</TD>
|
||||
<TD class="td2">Teilnehmer</TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD class="td3r">1.</TD>
|
||||
<TD class="td5">Maxime Musterfrau 1 (14)</TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD class="td3r">2.</TD>
|
||||
<TD class="td5">Maxime Musterfrau 2 (13)</TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD class="td3r">3.</TD>
|
||||
<TD class="td5">Maxime Musterfrau 3 (17)</TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD class="td3r">4.</TD>
|
||||
<TD class="td5">Maxime Musterfrau 4 (6)</TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD class="td3r">5.</TD>
|
||||
<TD class="td5">Maxime Musterfrau 5 (22)</TD>
|
||||
</TR>
|
||||
</TABLE>
|
||||
<TABLE class="tab2" border=0>
|
||||
</TABLE>
|
||||
<br>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<P><FONT size="1" face="Arial">Diese Liste wurde mit <A TARGET="_blank" HREF="http://www.TopTurnier.de">TopTurnier für Windows V9.3</A> erstellt.<br></FONT>
|
||||
</body>
|
||||
</HTML>
|
27
src/solo_turnier/tests/html_parser/2/expected.json
Normal file
27
src/solo_turnier/tests/html_parser/2/expected.json
Normal file
@ -0,0 +1,27 @@
|
||||
{
|
||||
"14": {
|
||||
"name": "Maxime Musterfrau 1",
|
||||
"place": "1.",
|
||||
"finalist": true
|
||||
},
|
||||
"13": {
|
||||
"name": "Maxime Musterfrau 2",
|
||||
"place": "2.",
|
||||
"finalist": true
|
||||
},
|
||||
"17": {
|
||||
"name": "Maxime Musterfrau 3",
|
||||
"place": "3.",
|
||||
"finalist": true
|
||||
},
|
||||
"6": {
|
||||
"name": "Maxime Musterfrau 4",
|
||||
"place": "4.",
|
||||
"finalist": true
|
||||
},
|
||||
"22": {
|
||||
"name": "Maxime Musterfrau 5",
|
||||
"place": "5.",
|
||||
"finalist": true
|
||||
}
|
||||
}
|
28
src/solo_turnier/tests/test_html_parser.py
Normal file
28
src/solo_turnier/tests/test_html_parser.py
Normal file
@ -0,0 +1,28 @@
|
||||
import pytest
|
||||
import os
|
||||
import json
|
||||
|
||||
import solo_turnier.html_parser
|
||||
|
||||
@pytest.fixture(scope='module', params=["1", '2'])
def dataProviderHtmlParser(request):
    """Provide the input HTML and expected result for one test case.

    Each parameter names a sub-directory of ``html_parser`` next to this
    file. That directory holds the page to parse (``erg.htm``) and the
    expected parser output (``expected.json``).

    Returns:
        A tuple ``(html, jsonContent)``: the page as a string and the decoded
        JSON document.
    """
    variant = request.param
    caseDir = os.path.join(os.path.dirname(__file__), 'html_parser', variant)
    htmlFile = os.path.join(caseDir, 'erg.htm')
    jsonFile = os.path.join(caseDir, 'expected.json')

    # The pages declare charset=utf-8 and contain non-ASCII characters, so
    # the platform's default encoding must not decide how they are read.
    with open(htmlFile, 'r', encoding='utf-8') as fp:
        html = fp.read()

    with open(jsonFile, 'r', encoding='utf-8') as fp:
        jsonContent = json.load(fp)

    return (html, jsonContent)
|
||||
|
||||
def test_extractDataFromHtml(dataProviderHtmlParser):
    """Parsing the provided page must yield exactly the expected mapping."""
    # The fixture hands over a two-element tuple: page source, expected result.
    pageSource, expectedResult = dataProviderHtmlParser

    parsed = solo_turnier.html_parser.HtmlParser().parseString(pageSource)

    assert parsed == expectedResult
|
Loading…
Reference in New Issue
Block a user