From 94acf816eabdda1b265a68ff8da6073961fc6676 Mon Sep 17 00:00:00 2001 From: leca Date: Fri, 13 Jun 2025 13:41:57 +0300 Subject: [PATCH] improvements on emailparser --- email_parser/emailparser.cpp | 104 ++++++++++++++++------------------- email_parser/emailparser.h | 2 + main.cpp | 3 +- translations/en_US.ts | 2 +- translations/ru_RU.ts | 2 +- 5 files changed, 53 insertions(+), 60 deletions(-) diff --git a/email_parser/emailparser.cpp b/email_parser/emailparser.cpp index 657861a..7e27fee 100644 --- a/email_parser/emailparser.cpp +++ b/email_parser/emailparser.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -24,34 +25,31 @@ EmailParser::EmailParser() { - + headings_regex = boost::regex("([\\w-]+:\\s*.{2,64}\\r\\n)+"); + end_marker_regex = boost::regex("--[^\\n\\r<>]{5,57}"); + part_end_regex = boost::regex("--[^\\n\\r<> ]{5,57}"); } std::map EmailParser::parse(std::string &email_content) { - // boost::regex check_data_content("t=\\d+T\\d+&s=\\d+\\.\\d+&fn=\\d{16}&i=\\d{4,5}&fp=\\d{10}&n=\\d"); - boost::regex to_erase("([\\w-]+:\\s*.{2,64}\\r\\n)+"); - boost::regex to_erase_two("--.{5,48}"); - std::string parameters; parameters = search_in_images(email_content); if (parameters != "") { - std::cout << parameters << std::endl; return get_params_from_string(parameters); } parameters = search_in_text(email_content); if (parameters != "") { - std::cout << parameters << std::endl; return get_params_from_string(parameters); } - + std::cout << "Failed to parse" << std::endl; /* If the code has reached this part and found nothing, it's most likely that there are no QR codes at all. */ return std::map(); } std::map EmailParser::parse_file(std::string path) { + std::cout << "Parsing file " << path << std::endl; std::string content = read_file(path); return parse(content); return std::map(); @@ -73,48 +71,51 @@ std::vector> EmailParser::find_parts(const boost::regex &sta } std::string EmailParser::find_check_parameters(std::string &part) { - boost::regex params_regex ("t(=|%3d)\\d+T\\d+(&|%26)s\\1\\d+\\.\\d+\\2fn\\1\\d{16}\\2i\\1\\d{3,6}\\2fp\\1\\d{9,10}\\2n\\1\\d", boost::regex::icase); - for (boost::sregex_iterator it{part.begin(), part.end(), params_regex}, end{}; it != end; it++) { - return it->str(); - } + boost::regex params_regex ("t(=|(%|=)3d)\\d+T\\d+(&(amp;)?|%26)s\\1\\d+\\.\\d+\\3fn\\1\\d{16}\\3i\\1\\d{3,6}\\3fp\\1\\d{9,10}\\3n\\1\\d", boost::regex::icase); + boost::smatch matched; + if (boost::regex_search(part, matched, params_regex)) + return matched[0].str(); + return ""; } std::string EmailParser::extract_qr_url_from_img(std::string &part) { - boost::regex img_tag_regex(""); - boost::regex img_url_str("https?:\\/\\/.*(qr(code)?)[^\\n\\r\"]+", boost::regex::icase); + boost::regex img_url_regex("(?<=str(); - for (boost::sregex_iterator it2{img_tag.begin(), img_tag.end(), img_url_str}, end2{}; it2 != end2; it2++) { - return it2->str(); - } - - } return ""; } +std::string EmailParser::extract_content_transfer_encoding(std::string &part) { + boost::regex content_transfer_encoding_regex("(?<=content-transfer-encoding: ).{0,20}(?=\\r\\n)", boost::regex::icase); + boost::smatch matched; + std::string transfer_encoding = ""; + + if (boost::regex_search(part, matched, content_transfer_encoding_regex)) + transfer_encoding = matched[0].str(); + + if (transfer_encoding == "") return ""; + std::transform(transfer_encoding.begin(), transfer_encoding.end(), transfer_encoding.begin(), ::tolower); + boost::trim(transfer_encoding); + return transfer_encoding; +} + std::vector EmailParser::extract_qr_embeddings_from_part(std::string &part) { std::vector embeddings = {}; - boost::regex img_tag_regex(""); - boost::regex img_base64_str("data:image\\/(png|jpg);base64,[\\w+\\/=]+", boost::regex::icase); - - for (boost::sregex_iterator it{part.begin(), part.end(), img_tag_regex}, end{}; it != end; it++) { - std::string img_tag = it->str(); - for (boost::sregex_iterator it2{img_tag.begin(), img_tag.end(), img_base64_str}, end2{}; it2 != end2; it2++) { - embeddings.push_back(split(it2->str(), ",")[1]); + boost::regex img_base64_regex("(?<=> images_content_parts = find_parts(images_content_type_regex, part_end_regex, content); /* iterate through found image content-types and try searching qr codes, decode them and see if it's the needed data */ @@ -122,10 +123,9 @@ std::string EmailParser::search_in_images(std::string &content) { for (unsigned int i = 0; i < images_content_parts.size(); i ++) { std::string part = content.substr(images_content_parts[i].first, images_content_parts[i].second); - boost::erase_regex(part, to_erase); - boost::erase_regex(part, to_erase_two); - part.erase(std::remove(part.begin(), part.end(), '\r'), part.end()); - part.erase(std::remove(part.begin(), part.end(), '\n'), part.end()); + boost::erase_regex(part, headings_regex); + boost::erase_regex(part, end_marker_regex); + boost::erase_all_regex(part, boost::regex("\\r\\n")); std::string decoded = base64_decode(part); return handle_image(decoded); } @@ -133,10 +133,7 @@ std::string EmailParser::search_in_images(std::string &content) { } std::string EmailParser::search_in_text(std::string &content) { - boost::regex text_content_types_regex("Content-Type: text\\/(html|plain)"); - boost::regex part_end_regex("--.{5,48}"); - boost::regex to_erase("([\\w-]+:\\s*.{2,64}\\r\\n)+"); - boost::regex to_erase_two("--.{5,48}"); + boost::regex text_content_types_regex("Content-Type: text\\/(html|plain)", boost::regex::icase); /* If the E-Mail has no QR code in it as a separate part, there's posibilly a QR code inserted using html's tag with base64-encoded image. Try searching it */ std::vector> texts_content_parts = find_parts(text_content_types_regex, part_end_regex, content); @@ -144,14 +141,15 @@ std::string EmailParser::search_in_text(std::string &content) { for (unsigned int i = 0; i < texts_content_parts.size(); i ++) { std::string part = content.substr(texts_content_parts[i].first, texts_content_parts[i].second); - boost::erase_regex(part, to_erase); - boost::erase_regex(part, to_erase_two); + std::string transfer_encoding = extract_content_transfer_encoding(part); + boost::erase_regex(part, headings_regex); + boost::erase_regex(part, end_marker_regex); - //If there's '<' character, most likely that the part's content is plain html, otherwise it's most likely a base64 encoded html. - if (part.find("<") == std::string::npos) { - part.erase(std::remove(part.begin(), part.end(), '\r'), part.end()); - part.erase(std::remove(part.begin(), part.end(), '\n'), part.end()); + if (transfer_encoding == "quoted-printable") { + boost::erase_all_regex(part, boost::regex("=\\r\\n")); + } else if (transfer_encoding == "base64") { + boost::erase_all_regex(part, boost::regex("\\r\\n")); part = base64_decode(part); } @@ -168,13 +166,7 @@ std::string EmailParser::search_in_text(std::string &content) { n.get_file(url, path); std::string qr_code_contents = read_file(path); - - std::vector data(qr_code_contents.begin(), qr_code_contents.end()); - cv::Mat image = cv::imdecode(cv::Mat(data), 1); - - cv::QRCodeDetector qrDecoder = cv::QRCodeDetector(); - std::string decoded_qr = qrDecoder.detectAndDecode(image); - parameters = find_check_parameters(decoded_qr); + parameters = handle_image(qr_code_contents); } if (parameters != "") return parameters; @@ -193,7 +185,7 @@ std::string EmailParser::search_in_text(std::string &content) { std::string EmailParser::handle_image(std::string &content) { cv::Mat image; - if (content.substr(1, 3) == "PNG" || content.substr(1, 3) == "JPG") { + if (content.substr(1, 3) == "PNG" || content.substr(1, 3) == "JPG" || content.substr(6, 4) == "JFIF") { std::vector data(content.begin(), content.end()); image = cv::imdecode(cv::Mat(data), 1); } else if (content.substr(0, 3) == "GIF") { diff --git a/email_parser/emailparser.h b/email_parser/emailparser.h index b94037b..7f80c3c 100644 --- a/email_parser/emailparser.h +++ b/email_parser/emailparser.h @@ -6,6 +6,7 @@ #include class EmailParser { + boost::regex headings_regex, end_marker_regex, part_end_regex; public: EmailParser(); std::map parse(std::string &email_content); @@ -13,6 +14,7 @@ public: std::vector> find_parts(const boost::regex &start_regex, const boost::regex &end_regex, const std::string &content); std::string find_check_parameters(std::string &part); std::string extract_qr_url_from_img(std::string &part); + std::string extract_content_transfer_encoding(std::string &part); std::vector extract_qr_embeddings_from_part(std::string &part); std::string search_in_images(std::string &content); diff --git a/main.cpp b/main.cpp index 849535a..fd25f4e 100644 --- a/main.cpp +++ b/main.cpp @@ -34,8 +34,7 @@ int main(int argc, char *argv[]) { // EmailParser p; - // p.parse_file("/home/leca/example_email_receipts/lamoda.eml"); - // // p.parse_file("/home/leca/example_email_receipts/lamoda2.eml"); + // p.parse_file("/home/leca/example_email_receipts/avito.eml"); // p.parse_file("/home/leca/example_email_receipts/читай_город.eml"); // p.parse_file("/home/leca/example_email_receipts/lenta.eml"); // p.parse_file("/home/leca/example_email_receipts/magnit.eml"); diff --git a/translations/en_US.ts b/translations/en_US.ts index cc915f7..651939a 100644 --- a/translations/en_US.ts +++ b/translations/en_US.ts @@ -671,7 +671,7 @@ QObject - + Using locale: Using locale: diff --git a/translations/ru_RU.ts b/translations/ru_RU.ts index 7d180c3..9932c3b 100644 --- a/translations/ru_RU.ts +++ b/translations/ru_RU.ts @@ -647,7 +647,7 @@ QObject - + Using locale: Использую локаль: