Parse preview rounds in batch script

2022-11-27 09:10:17 +01:00
parent ffa1e36b6f
commit 9e59ca79d2
4 changed files with 131 additions and 34 deletions
--- a/src/solo_turnier/batch.py
+++ b/src/solo_turnier/batch.py
@@ -177,46 +177,60 @@ class BatchWorker:
        self.l.debug(self.config.__dict__)
        locator = solo_turnier.html_locator.HtmlLocator()
-        self.l.info('Checking for feasible HTML export files in "%s"', self.config.importHtmlPath())
+        self.l.info('Checking for feasible preview HTML export files in "%s"', self.config.importHtmlPath())
-        htmlCandidates = locator.findCandidates(self.config.importHtmlPath())
+        htmlCandidatesPreview = locator.findPreviewRoundCandidates(self.config.importHtmlPath())
-        self.l.debug('Found HTML file candidates: %s', htmlCandidates)
+        self.l.debug('Found HTML file candidates for preview rounds: %s', htmlCandidatesPreview)
-        htmlParser = solo_turnier.html_parser.HtmlParser()
+        previewWorker = solo_turnier.worker.PreviewWorker()
-        htmlImports = []
+        self.l.info('Filtering for pure preview rounds.')
-        self.l.info('Importing the candidates')
+        parsers = previewWorker.filterFilesPreview(htmlCandidatesPreview)
-        for candidate in htmlCandidates:
+        self.l.debug('Remaining files: %s', parsers.keys())
-            self.l.debug('Processing file %s', candidate)
+
-            with open(candidate, 'r') as fp:
+        self.l.info('Extracting person data from the preview rounds.')
-                fileContent = fp.read()
+        previewWorker.extractPersonsFromPreview(parsers)
            htmlImports.append(htmlParser.parseString(fileContent))
        self.l.info('All HTML files have been parsed')
        csvReader = solo_turnier.reader.AllResultReader(self.config.importCSVPath())
        self.l.info('Importing the total result CSV file %s', self.config.importCSVPath())
        csvData = csvReader.readFile()
        self.l.info('CSV file has been read')
        self.l.info('Processing the imported data')
        csvExtractor = solo_turnier.worker.CSVExtractor()
        self.l.debug('Importing CSV data into internal structures')
        csvRows = csvExtractor.mapCSVImport(csvData)
        worker = solo_turnier.worker.DataWorker()
        self.l.debug('Combining results from CSV for individual users')
        data = worker.combineRowsByPerson(csvRows)
        self.l.debug('Fix the groups for combined competitions')
        unambiguous, fixedGroups = worker.consolidateGroups(data)
        if fixedGroups:
            self.l.info('It was required to fix some group issues.')
        if not unambiguous:
            self.warning('There were couples whose group could not be extracted unambiguously.')
        self.l.debug('Merging HTML and CSV data')
        worker.mergeHtmlData(data, htmlImports)
        self.l.info('Data is prepared')
-        consoleOutputtter = solo_turnier.output.ConsoleOutputter()
+        # self.l.info('Checking for feasible HTML export files in "%s"', self.config.importHtmlPath())
-        consoleOutputtter.output(data)
+        # htmlCandidates = locator.findCandidates(self.config.importHtmlPath())
        # self.l.debug('Found HTML file candidates: %s', htmlCandidates)
        # htmlParser = solo_turnier.html_parser.HtmlParser()
        # htmlImports = []
        # self.l.info('Importing the candidates')
        # for candidate in htmlCandidates:
        #     self.l.debug('Processing file %s', candidate)
        #     with open(candidate, 'r') as fp:
        #         fileContent = fp.read()
        #     htmlImports.append(htmlParser.parseString(fileContent))
        # self.l.info('All HTML files have been parsed')
        # self.l.info('Processing the imported data')
        # csvExtractor = solo_turnier.worker.CSVExtractor()
        # self.l.debug('Importing CSV data into internal structures')
        # csvRows = csvExtractor.mapCSVImport(csvData)
        # self.l.debug('Combining results from CSV for individual users')
        # data = worker.combineRowsByPerson(csvRows)
        # self.l.debug('Fix the groups for combined competitions')
        # unambiguous, fixedGroups = worker.consolidateGroups(data)
        # if fixedGroups:
        #     self.l.info('It was required to fix some group issues.')
        # if not unambiguous:
        #     self.warning('There were couples whose group could not be extracted unambiguously.')
        # self.l.debug('Merging HTML and CSV data')
        # worker.mergeHtmlData(data, htmlImports)
        # self.l.info('Data is prepared')
        # consoleOutputtter = solo_turnier.output.ConsoleOutputter()
        # consoleOutputtter.output(data)
    def run1(self):
        allResults, finals = self.__extractDataFromFiles()
--- a/src/solo_turnier/html_parser.py
+++ b/src/solo_turnier/html_parser.py
@@ -170,7 +170,7 @@ class HtmlParser:
    def cleanPreparationRoundImport(self, data):
        def __cleanTable(table):
            def __cleanText(s: str):
-                print("cleaning string ", s)
+                # print("cleaning string ", s)
                return s.strip(' \n\xa0')
            def __cleanEntry(entry):
@@ -180,7 +180,7 @@ class HtmlParser:
            for row in table:
                for entry in row:
-                    print(entry)
+                    # print(entry)
                    __cleanEntry(entry)
        data['title'] = data['title'].strip()
--- a/src/solo_turnier/reader.py
+++ b/src/solo_turnier/reader.py
@@ -4,6 +4,7 @@ import csv
 import os
 import logging
 import re
 from pprint import pformat
 class AllResultReader:
    def __init__(self, fileName: str):
@@ -25,5 +26,6 @@ class AllResultReader:
            'data': rows[1:]
        }
-        logging.getLogger('solo_turnier.reader.all_results').debug('Imported results from allresults.csv file: %s', ret)
+        l = logging.getLogger('solo_turnier.reader.all_results')
        l.log(5, 'Imported results from allresults.csv file: %s', (ret))
        return ret
--- a/src/solo_turnier/worker.py
+++ b/src/solo_turnier/worker.py
@@ -20,6 +20,15 @@ class ResultRow:
    def __str__(self):
        return f'{self.name} ({self.id}, {self.club}) is in {self.group} {self.class_} and danced the {self.dance} in {self.competitionGroup} {self.competitionClass} getting place {self.place}-{self.placeTo}'
 class HtmlPerson:
    """Participant as read from an HTML export: name, start id and group."""

    def __init__(self, name, id, group):
        # Plain value holder; the arguments are stored without validation.
        self.name, self.id, self.group = name, id, group

    def __repr__(self):
        """Return the compact form ``name (id, group)`` used in log output."""
        return '{} ({}, {})'.format(self.name, self.id, self.group)
 class ResultPerson:
    def __init__(self, firstName, lastName, club, id = None, group = None):
        self.firstName = firstName
@@ -148,6 +157,76 @@ class CSVExtractor:
        return ret
 class PreviewWorker:
    """Locate preview-round HTML exports and extract the persons listed in them."""

    def __init__(self):
        self.l = logging.getLogger('solo_turnier.worker.PreviewWorker')

    def filterFilesPreview(self, files: list[str]) -> dict[str, html_parser.HtmlParser]:
        """Keep only those files whose HTML title identifies a preview round.

        Every file is read completely and wrapped in an ``HtmlParser``. A file
        is kept when the data guessed from its title has ``class_`` equal to
        ``'Sichtung'``. Files whose title cannot be interpreted are logged as
        errors and skipped.

        :param files: paths of the candidate HTML files
        :return: mapping of file path to the parser built from that file
        """
        self.l.debug('Filtering the list of parsers by removing all non preview entries.')
        ret = {}
        for file in files:
            with open(file, 'r') as fp:
                text = fp.read()

            parser = html_parser.HtmlParser(text)

            try:
                data = parser.guessDataFromHtmlTitle()
            except Exception:
                # Deliberately broad (the parser's failure modes are not
                # visible here), but no longer a bare ``except`` so that
                # KeyboardInterrupt/SystemExit still propagate.
                self.l.error(f'Unable to parse html file in {file}. Please check manually.')
                continue

            if data['class_'] == 'Sichtung':
                self.l.debug(f"Found candidate in {file}. Adding to the list.")
                ret[file] = parser
            else:
                self.l.debug(f'Rejecting file {file} as the name {data["class_"]} did not match.')

        return ret

    def __extractPersonsFromSinglePreview(self, parser: html_parser.HtmlParser):
        """Build the list of ``HtmlPerson`` objects found in one preview file.

        The first table row supplies the ids (``text``) and names (``meta``)
        of the participants; cells with empty text are ignored. The group is
        read per person from the last table row when the last title is
        ``'Startgruppe'``, otherwise one group guessed from the page title is
        used for everybody.

        :param parser: parser wrapping the preview round export
        :return: list of ``HtmlPerson``, in table column order
        :raises Exception: if the first title is not ``'Wertungsrichter'``
        """
        imported = parser.parsePreparationRound()
        parser.cleanPreparationRoundImport(imported)
        data = imported['data']

        self.l.log(5, data)

        if data['titles'][0] != 'Wertungsrichter':
            self.l.critical('Cannot parse the parsed content of the preview file.')
            raise Exception('Incompatible export file')

        # Remember the column index of every populated cell so that the group
        # row can be read at the very same positions below.
        populated = [
            (index, entry)
            for index, entry in enumerate(data['table'][0])
            if entry['text'] != ''
        ]
        indices = [index for index, _ in populated]
        ids = [entry['text'] for _, entry in populated]
        names = [entry['meta'] for _, entry in populated]

        if data['titles'][-1] == 'Startgruppe':
            self.l.debug('Combined competition found. Extracting group from table')
            groups = [data['table'][-1][idx]['text'] for idx in indices]
        else:
            self.l.debug('Using group from the title.')
            # NOTE(review): called with an explicit title here but without
            # arguments in filterFilesPreview - presumably the parameter is
            # optional; confirm against HtmlParser.
            group = parser.guessDataFromHtmlTitle(imported['title'])['group']
            groups = [group] * len(indices)

        ret = [
            HtmlPerson(name, id_, group)
            for name, id_, group in zip(names, ids, groups)
        ]

        self.l.log(5, ret)
        return ret

    def extractPersonsFromPreview(self, parsers):
        """Run the person extraction for every parser in ``parsers``.

        :param parsers: mapping of file path to parser, as returned by
            ``filterFilesPreview``
        """
        # NOTE(review): the extracted persons are currently discarded; only
        # the logging and the format check of the extraction take effect.
        for file, parser in parsers.items():
            self.l.debug('Extracting person data from %s', file)
            self.__extractPersonsFromSinglePreview(parser)
 class DataWorker:
    def __init__(self):
        self.l = logging.getLogger('solo_turnier.worker')
@@ -297,3 +376,5 @@ class DataWorker:
                ret.append(competitions[0])
        return ret