Create a basic code snippet to parse mails automatically

Start of #27
2024-01-14 20:41:17 +01:00 · 2024-01-14 20:41:17 +01:00 · 55e3efb3a8
commit 55e3efb3a8
parent 05285d29d4
12 changed files with 442 additions and 0 deletions
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@ -0,0 +1,24 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python: Remote Attach",
+            "type": "python",
+            "request": "attach",
+            "connect": {
+                "host": "localhost",
+                "port": 5678
+            },
+            "pathMappings": [
+                // {
+                //     "localRoot": "${workspaceFolder}",
+                //     "remoteRoot": "."
+                // }
+            ],
+            "justMyCode": true
+        }
+    ]
+}
--- a/scripts/read-competition-notification/.gitignore
+++ b/scripts/read-competition-notification/.gitignore
@ -0,0 +1 @@
+__pycache__/
--- a/scripts/read-competition-notification/competitionNotificationReader/__init__.py
+++ b/scripts/read-competition-notification/competitionNotificationReader/__init__.py
@ -0,0 +1,50 @@
+from . import cli
+from . import mail
+from . import headerExtractor
+from . import mailParser
+from . import competitionParser
+from . import mboxReader
+
+import logging
+import debugpy
+import os
+
+def main():
+    """Run the command line tool that turns notification mails into Markdown files.
+
+    Reads the options via ``cli.getArgs()``, sets the root logger level
+    according to ``-v`` and, with ``--debug``, waits for a debugger to attach.
+    In batch mode (``--read-mbox``) every mail of the mbox file is parsed and
+    rendered into a file below ``--output-folder``; the file name is derived
+    from the parsed competition data.
+
+    Raises:
+        SystemExit: With exit code 1 if ``--read-mbox`` is given without
+            ``--output-folder``.
+        Exception: If ``--read-mbox`` is missing (reading from stdin is not
+            yet implemented).
+    """
+    args = cli.getArgs()
+
+    logging.basicConfig()
+    logger = logging.getLogger(__name__)
+
+    # No -v shows warnings only, a single -v adds info, anything above enables debug.
+    verbosityMap = {
+        0: logging.WARNING,
+        1: logging.INFO,
+    }
+    rootLogger = logging.getLogger()
+    rootLogger.setLevel(verbosityMap.get(args.verbose, logging.DEBUG))
+
+    if args.debug:
+        # Blocks until a debugger client connects on port 5678.
+        debugpy.listen(5678)
+        debugpy.wait_for_client()
+
+    mp = mailParser.MailParser()
+    cp = competitionParser.CompetitionParser()
+
+    if args.read_mbox is None:
+        raise Exception('Not yet implemented')
+
+    if args.output_folder is None:
+        logger.error('Cannot use batch mode without explicit output folder.')
+        # Same effect as exit(1), but does not depend on the site module's helper.
+        raise SystemExit(1)
+
+    reader = mboxReader.MBocReader()
+    # The loop variable is not called "mail" so it cannot be confused with the
+    # imported mail module.
+    for rawMail in reader.parseMBoxFile(args.read_mbox[0]):
+        body = mp.parseMail(rawMail)
+        cp.parseMail(body)
+        filename = cp.getFilename(args.output_folder[0])
+        logger.info('Using file %s to generate the output.', filename)
+        os.makedirs(os.path.dirname(filename), exist_ok=True)
+        # Names regularly contain umlauts, so do not rely on the locale's
+        # default encoding when writing the result.
+        with open(filename, 'w', encoding='utf-8') as fp:
+            fp.write(cp.getContent())
+
--- a/scripts/read-competition-notification/competitionNotificationReader/__main__.py
+++ b/scripts/read-competition-notification/competitionNotificationReader/__main__.py
@ -0,0 +1,3 @@
+# Runs the command line tool by calling the package's main() function.
+import competitionNotificationReader
+
+competitionNotificationReader.main()
--- a/scripts/read-competition-notification/competitionNotificationReader/cli.py
+++ b/scripts/read-competition-notification/competitionNotificationReader/cli.py
@ -0,0 +1,11 @@
+import argparse
+
+def getArgs():
+    """Parse the command line (``sys.argv``) and return the resulting namespace.
+
+    Returns:
+        The ``argparse.Namespace`` with the attributes ``read_mbox`` and
+        ``output_folder`` (one-element lists or ``None``), ``verbose``
+        (number of ``-v`` flags) and ``debug`` (bool).
+    """
+    # Each entry: (option strings, keyword arguments for add_argument).
+    optionSpecs = (
+        (('--read-mbox',), {'nargs': 1, 'help': 'Read mails from mbox file instead of stdin'}),
+        (('-o', '--output-folder'), {'nargs': 1, 'help': 'Set the output folder of the generated files.'}),
+        (('-v', '--verbose'), {'action': 'count', 'default': 0, 'help': 'Increase the verbosity'}),
+        (('--debug',), {'action': 'store_true', 'help': 'Enable python debugger'}),
+    )
+
+    argParser = argparse.ArgumentParser()
+    for flags, settings in optionSpecs:
+        argParser.add_argument(*flags, **settings)
+
+    return argParser.parse_args()
--- a/scripts/read-competition-notification/competitionNotificationReader/competitionParser.py
+++ b/scripts/read-competition-notification/competitionNotificationReader/competitionParser.py
@ -0,0 +1,132 @@
+import bs4
+import logging
+import re
+import os
+import jinja2
+
+class ParsingFailedEception(Exception):
+    """Raised when a part of a notification mail cannot be parsed.
+
+    Takes the same arguments as ``Exception``; no behavior is added.
+    """
+
+class CompetitionParser:
+    """Extract the competition data of one notification mail and render it.
+
+    ``parseMail`` fills the instance from the HTML body of a mail. Afterwards
+    ``getFilename`` and ``getContent`` derive the output path and the rendered
+    template from the stored values. Every successful ``parseMail`` call
+    overwrites the values of the previous mail.
+    """
+
+    def __init__(self):
+        self._l = logging.getLogger(__name__)
+
+        self._partner = ''
+        self._partnerin = ''
+        self._date = ''
+        self._title = ''
+        self._number = ''
+        self._group = ''
+        self._class = ''
+        self._section = ''
+        self._ort = ''
+        self._verein = ''
+        self._telefon = ''
+
+        # Raw strings keep regex escapes such as \. away from Python's own
+        # string escape handling.
+        self._reName = re.compile(r'Neue Meldung für (.*) / (.*)!')
+        self._reDate = re.compile(r'([0-9]+)\.([0-9]+)\.([0-9]+)')
+        self._reNumber = re.compile(r'Turnier: ([0-9]+)')
+        self._rePhone = re.compile(r'Telefon: ([0-9 /]+)')
+        self._rePlace = re.compile(r'Ort: (.*), (.*)')
+        self._reCompetition = re.compile(r'(.*) ([A-ES]) ((?:Std)|(?:Lat)|(?:Kombi))')
+
+        self._reCleaningString = re.compile(r'[^a-z0-9-]')
+        self._reDashes = re.compile(r'-+')
+
+    def parseMail(self, body: str):
+        """Parse the HTML ``body`` of a notification mail into this instance.
+
+        The first ``h2`` element supplies the names, the first ``table``
+        element the remaining fields.
+
+        Raises:
+            ParsingFailedEception: If the heading or a table cell does not
+                have the expected format.
+        """
+        parser = bs4.BeautifulSoup(body, 'html.parser')
+        self._getNames(parser.h2)
+        self._parseTable(parser.table)
+
+    def _getNames(self, h2):
+        """Store both names found in the heading element ``h2``.
+
+        Raises:
+            ParsingFailedEception: If the heading is missing, has no plain
+                string content or does not match the expected wording.
+        """
+        # A missing <h2> or one with nested markup has no usable .string;
+        # report that as a parsing failure instead of an AttributeError/TypeError.
+        heading = None if h2 is None else h2.string
+        matcher = None if heading is None else self._reName.match(heading)
+        if matcher is None:
+            self._l.error('Parsing of header "%s" failed.', h2)
+            raise ParsingFailedEception('Header could not be successfully parsed')
+        self._partner = matcher.group(1)
+        self._partnerin = matcher.group(2)
+
+    def _parseTable(self, table):
+        """Store date, title, number, competition, place and phone from ``table``.
+
+        The first six ``td`` cells are read in exactly that order.
+
+        Raises:
+            ParsingFailedEception: If the table is missing, has fewer than six
+                cells, a cell has no plain string content or a cell does not
+                match its expected format.
+        """
+        def fullmatch(pattern, text, message):
+            match = pattern.fullmatch(text)
+            if match is None:
+                raise ParsingFailedEception(message)
+            return match
+
+        if table is None:
+            raise ParsingFailedEception('No table found in mail')
+
+        cells = table('td')
+        if len(cells) < 6:
+            raise ParsingFailedEception(
+                f'Expected at least 6 table cells but found {len(cells)}'
+            )
+
+        texts = []
+        for cell in cells[:6]:
+            if cell.string is None:
+                raise ParsingFailedEception(f'Cannot extract the text of table cell {cell}')
+            texts.append(cell.string.strip())
+        date, title, number, competition, place, phone = texts
+
+        day, month, year = fullmatch(
+            self._reDate, date, 'Cannot parse date %s in mail' % date
+        ).groups()
+        # Zero-pad day and month so the result is a valid ISO date and the
+        # generated file names sort chronologically.
+        self._date = f'{year}-{month.zfill(2)}-{day.zfill(2)}'
+
+        self._title = title
+
+        self._number = fullmatch(
+            self._reNumber, number, f'Cannot parse the turnier number in field {number}'
+        ).group(1)
+
+        self._group, self._class, self._section = fullmatch(
+            self._reCompetition, competition, f'Cannot parse the competition line {competition}'
+        ).groups()
+
+        self._verein, self._ort = fullmatch(
+            self._rePlace, place, f'Cannot parse the place entry {place}'
+        ).groups()
+
+        self._telefon = fullmatch(
+            self._rePhone, phone, f'Cannot parse the phone line {phone}'
+        ).group(1)
+
+    def _cleanName(self, name: str) -> str:
+        """Return ``name`` as a lower-case slug made of ``a-z``, ``0-9`` and ``-``.
+
+        German umlauts and ``ß`` are transliterated, every other character is
+        replaced by a dash, runs of dashes are collapsed and dashes at either
+        end are removed.
+        """
+        cleanedName = name.lower()
+        for umlaut, replacement in (('ä', 'ae'), ('ö', 'oe'), ('ü', 'ue'), ('ß', 'ss')):
+            cleanedName = cleanedName.replace(umlaut, replacement)
+        cleanedName = self._reCleaningString.sub('-', cleanedName)
+        cleanedName = self._reDashes.sub('-', cleanedName)
+        # Without this a name ending in punctuation yields "--" once the
+        # parts are joined into the file name.
+        return cleanedName.strip('-')
+
+    def getFilename(self, prefix: str) -> str:
+        """Return the output path ``<prefix>/<year>/<date>-<ort>-<names>-<competition>.md``.
+
+        All variable parts are passed through ``_cleanName`` so that spaces,
+        umlauts or path separators in the parsed data cannot end up in the
+        file name.
+        """
+        namePartner = self._cleanName(self._partner)
+        namePartnerin = self._cleanName(self._partnerin)
+        competitionName = self._cleanName(f'{self._group} {self._class} {self._section}')
+        placeName = self._cleanName(self._ort)
+
+        return os.path.join(
+            prefix,
+            self._date[0:4],
+            f'{self._date}-{placeName}-{namePartner}-{namePartnerin}-{competitionName}.md'
+        )
+
+    def getContent(self) -> str:
+        """Render ``contenttemplate.md.tmpl`` (next to this module) with the parsed values."""
+        templatePath = os.path.join(os.path.dirname(__file__), 'contenttemplate.md.tmpl')
+        with open(templatePath, encoding='utf-8') as fp:
+            tpl = fp.read()
+        j2 = jinja2.Template(tpl)
+        context = {
+            'date': self._date,
+            'partner': self._partner,
+            'partnerin': self._partnerin,
+            'verein': self._verein,
+            'ort': self._ort,
+            'telefon': self._telefon,
+            'group': self._group,
+            'class': self._class,
+            'section': self._section,
+            'title': self._title,
+            'number': self._number,
+        }
+        return j2.render(**context)
--- a/scripts/read-competition-notification/competitionNotificationReader/contenttemplate.md.tmpl
+++ b/scripts/read-competition-notification/competitionNotificationReader/contenttemplate.md.tmpl
@ -0,0 +1,13 @@
+---
+dateCompetition: {{ date }}
+partner: "{{ partner }}"
+partnerin: "{{ partnerin }}"
+verein: "{{ verein }}"
+ort: "{{ ort }}"
+telefon: "{{ telefon }}"
+gruppe: "{{ group }}"
+klasse: "{{ class }}"
+sektion: "{{ section }}"
+titel: "{{ title }}"
+nummer: {{ number }}
+---
--- a/scripts/read-competition-notification/competitionNotificationReader/headerExtractor.py
+++ b/scripts/read-competition-notification/competitionNotificationReader/headerExtractor.py
@ -0,0 +1,30 @@
+import competitionNotificationReader as  cnr
+import logging
+
+def splitHeaders(lines: list[str]) -> cnr.mail.Mail:
+    """Split the lines of a mail (or MIME part) into header entries and body lines.
+
+    Headers end at the first empty line. Folded header lines (starting with a
+    space or tab) are appended to the value of the preceding header, separated
+    by one space. Blank lines directly after the separator are not part of the
+    returned body.
+
+    Args:
+        lines: The lines of the mail without their line terminators.
+
+    Returns:
+        A ``Mail`` whose headers are ``(name, value)`` tuples in their
+        original order and whose body is the list of remaining lines. The
+        body is empty if there is no separator line or nothing but blank
+        lines after it.
+    """
+    logger = logging.getLogger(__name__)
+
+    logger.debug('Separating headers of an email')
+
+    headerLines = []
+    bodyLines = []
+    for idx, line in enumerate(lines):
+        # A stray carriage return (file with CRLF endings) still counts as
+        # the empty separator line.
+        if line.rstrip('\r') == '':
+            remainingLines = lines[idx + 1:]
+            for j, remainingLine in enumerate(remainingLines):
+                if remainingLine.strip() != '':
+                    bodyLines = remainingLines[j:]
+                    break
+            break
+
+        if line.startswith(('\t', ' ')):
+            if not headerLines:
+                # Nothing to continue; previously this raised an IndexError.
+                logger.warning('Ignoring continuation line without preceding header: "%s"', line)
+                continue
+            name, value = headerLines.pop()
+            headerLines.append((name, f'{value} {line.strip()}'))
+        else:
+            # partition() keeps a malformed line without a colon as a header
+            # with an empty value instead of failing with an IndexError.
+            name, _, value = line.partition(':')
+            headerLines.append((name.strip(), value.strip()))
+
+    # Reached without a separator line when the input consists of headers only.
+    return cnr.mail.Mail(headerLines, bodyLines)
--- a/scripts/read-competition-notification/competitionNotificationReader/mail.py
+++ b/scripts/read-competition-notification/competitionNotificationReader/mail.py
@ -0,0 +1,11 @@
+import dataclasses
+
+# Aliases naming the two halves of a header entry.
+HeaderName_t = str
+HeaderValue_t = str
+# One header as a (name, value) pair.
+HeaderEntry_t = tuple[HeaderName_t, HeaderValue_t]
+
+@dataclasses.dataclass
+class Mail:
+    """A mail, or one part of a multipart mail, split into headers and body lines."""
+    # Header entries as (name, value) pairs.
+    headers: list[HeaderEntry_t]
+    # The body, one list element per line.
+    body: list[str]
+
--- a/scripts/read-competition-notification/competitionNotificationReader/mailParser.py
+++ b/scripts/read-competition-notification/competitionNotificationReader/mailParser.py
@ -0,0 +1,113 @@
+import competitionNotificationReader as cnr
+import logging
+import re
+
+class MailParser:
+    """Extract the decoded HTML part of a ``multipart/alternative`` mail."""
+
+    # Boundary parameter of a Content-Type header, either quoted or bare.
+    _reBoundary = re.compile(r'boundary=(?:"([^"]+)"|([^\s;"]+))', re.IGNORECASE)
+    # The two hex digits of a quoted-printable escape such as "=C3".
+    _reHexPair = re.compile(r'[0-9A-Fa-f]{2}')
+
+    def __init__(self):
+        self._l = logging.getLogger(__name__)
+
+    def parseMail(self, rawMail: cnr.mail.Mail):
+        """Return the decoded ``text/html`` part of ``rawMail`` as one string.
+
+        Only parts that are ``text/html`` and use the ``quoted-printable``
+        transfer encoding are considered; both values are compared
+        case-insensitively.
+
+        Raises:
+            Exception: If the mail is not ``multipart/alternative`` or does
+                not contain exactly one matching part.
+            SystemExit: If the content type header is missing, ambiguous or
+                lacks a boundary (see ``_getContentType``).
+        """
+        _, boundary = self._getContentType(rawMail)
+        subMails = self._splitMultipartBody(rawMail.body, boundary)
+
+        def headerValue(mail, name):
+            # The first header with the given name wins.
+            for header in mail.headers:
+                if header[0].lower() == name:
+                    return header[1].strip().lower()
+            return None
+
+        htmlParts = [
+            part for part in subMails
+            if (headerValue(part, 'content-type') or '').startswith('text/html')
+            and headerValue(part, 'content-transfer-encoding') == 'quoted-printable'
+        ]
+
+        if len(htmlParts) != 1:
+            raise Exception('Not implemented')
+
+        return self._mapQuotedrintable(htmlParts[0].body)
+
+    def _getContentType(self, rawMail: cnr.mail.Mail) -> tuple[str, str]:
+        """Return ``('multipart/alternative', boundary)`` for ``rawMail``.
+
+        Raises:
+            SystemExit: With code 1 if there is not exactly one content type
+                header or it carries no boundary parameter.
+            Exception: If the content type is not ``multipart/alternative``.
+        """
+        ctHeaders = [h for h in rawMail.headers if h[0].lower() == 'content-type']
+        if len(ctHeaders) != 1:
+            self._l.error('No unique content type of the mail was found.')
+            # Same effect as exit(1) without relying on the site module's helper.
+            raise SystemExit(1)
+
+        ct = ctHeaders[0][1]
+        if not ct.lower().startswith('multipart/alternative'):
+            raise Exception('Not yet implemented')
+
+        matcher = self._reBoundary.search(ct)
+        if matcher is None:
+            self._l.error('Cannot extract boundary from mail header.')
+            raise SystemExit(1)
+
+        # Exactly one of the two groups (quoted / bare) is set.
+        boundary = matcher.group(1) or matcher.group(2)
+
+        return 'multipart/alternative', boundary
+
+    def _splitMultipartBody(self, bodyLines: list[str], boundary: str):
+        """Split ``bodyLines`` at the boundary lines and parse each part's headers.
+
+        Lines before the first boundary (the MIME preamble) and lines after
+        the last boundary are not part of any returned mail.
+        """
+        delimiter = f'--{boundary}'
+        parts = []
+        # None until the first boundary was seen, so the preamble is skipped
+        # instead of being handed to the header parser as a bogus part.
+        subBody = None
+        for line in bodyLines:
+            if line.startswith(delimiter):
+                if subBody:
+                    parts.append(subBody)
+                subBody = []
+            elif subBody is not None:
+                subBody.append(line)
+        return [cnr.headerExtractor.splitHeaders(part) for part in parts]
+
+    def _mapQuotedrintable(self, lines: list[str]):
+        """Decode quoted-printable ``lines`` and return them as a single string.
+
+        Soft line breaks (a trailing ``=``) are removed, ``=XX`` escapes are
+        replaced by the byte they denote and the bytes of each line are
+        decoded with the default codec of ``bytes.decode`` (UTF-8). The
+        decoded lines are concatenated without any separator, i.e. hard line
+        breaks are dropped. An ``=`` that is not followed by two hex digits is
+        kept literally.
+
+        Raises:
+            UnicodeDecodeError: If the decoded bytes of a line are not valid UTF-8.
+        """
+        # Undo soft line breaks: a line ending in "=" continues on the next one.
+        mergedLines = []
+        pending = ''
+        for line in lines:
+            if line.endswith('='):
+                pending += line[:-1]
+            else:
+                mergedLines.append(pending + line)
+                pending = ''
+        if pending:
+            # The input ended with a soft line break; keep what was collected.
+            mergedLines.append(pending)
+
+        decodedLines = []
+        for line in mergedLines:
+            raw = bytearray()
+            pos = 0
+            while pos < len(line):
+                hexChars = line[pos + 1:pos + 3]
+                if line[pos] == '=' and self._reHexPair.fullmatch(hexChars):
+                    raw.append(int(hexChars, 16))
+                    pos += 3
+                else:
+                    raw.extend(line[pos].encode())
+                    pos += 1
+            # Decode per merged line: a multi-byte character may span a soft
+            # line break, which has been removed above.
+            decodedLines.append(raw.decode())
+
+        return ''.join(decodedLines)
+
--- a/scripts/read-competition-notification/competitionNotificationReader/mboxReader.py
+++ b/scripts/read-competition-notification/competitionNotificationReader/mboxReader.py
@ -0,0 +1,49 @@
+import logging
+import re
+import io
+
+import competitionNotificationReader as cnr
+
+class MBocReader:
+    """Read an mbox file and split it into ``Mail`` objects."""
+
+    # Body lines starting with "From " are escaped in mbox files by prepending ">".
+    _reEscapedFrom = re.compile(r'^>+From ')
+
+    def __init__(self):
+        self._l = logging.getLogger(__name__)
+
+    def parseMBoxFile(self, filename: str) -> list[cnr.mail.Mail]:
+        """Return all mails stored in the mbox file ``filename``.
+
+        The file is opened in text mode with the platform's default encoding.
+        """
+        self._l.debug('Reading MBox file "%s"', filename)
+
+        with open(filename) as fp:
+            return self._parseMails(fp)
+
+    def _isNewMailLine(self, line: str):
+        """Return whether ``line`` is the ``From `` line that starts a new mail."""
+        return line.startswith('From ')
+
+    def _fixSingleLine(self, line: str) -> str:
+        """Remove one level of ``>`` escaping from a ``>From `` line.
+
+        Lines that do not match ``>+From `` are returned unchanged.
+        """
+        if self._reEscapedFrom.match(line) is None:
+            return line
+
+        return line[1:]
+
+    def _parseMails(self, fp: io.TextIOBase) -> list[cnr.mail.Mail]:
+        """Collect the lines between ``From `` separators of ``fp`` and parse each group.
+
+        The separator lines themselves are dropped; groups without any line
+        do not produce a mail.
+        """
+        lines = []
+        mails = []
+        for line in fp:
+            if self._isNewMailLine(line):
+                if lines:
+                    mails.append(self._parseSingleMail(lines))
+                lines = []
+            else:
+                # Strip only an actual newline: the last line of a file may
+                # lack one, and cutting the final character would lose data.
+                lines.append(self._fixSingleLine(line.removesuffix('\n')))
+
+        if lines:
+            mails.append(self._parseSingleMail(lines))
+        return mails
+
+    def _parseSingleMail(self, lines: list[str]) -> cnr.mail.Mail:
+        """Turn the raw lines of one mail into a ``Mail`` with separated headers."""
+        return cnr.headerExtractor.splitHeaders(lines)
--- a/scripts/read-competition-notification/requirements.txt
+++ b/scripts/read-competition-notification/requirements.txt
@ -0,0 +1,5 @@
+beautifulsoup4==4.12.2
+debugpy==1.8.0
+Jinja2==3.1.3
+MarkupSafe==2.1.3
+soupsieve==2.5