From 5afaf6a94f6e778a048849e2cadf3c0ebce99599 Mon Sep 17 00:00:00 2001
From: leca
Date: Tue, 10 Jun 2025 23:01:31 +0300
Subject: [PATCH] divide into functions

---
Reviewer notes (text between the "---" line and the diff is not part of the
commit message):

- search_in_images() returned the result of find_check_parameters() for the
  first image part unconditionally, so any further image parts were never
  inspected. It now returns only when the result is non-empty, which is what
  the loop removed from parse() did and what search_in_text() does.
- NOTE(review): the main.cpp hunk enables a parsing harness with hard-coded
  paths followed by "return 0;", so nothing after it in main() runs. Confirm
  this is intended before merging.
- NOTE(review): this copy of the patch looks whitespace-collapsed and seems to
  have lost angle-bracket content (e.g. "std::vector>", "std::map" without
  arguments). Indentation and blank context lines were re-created by hand;
  verify against the original commit before applying.

 email_parser/emailparser.cpp | 202 +++++++++++++++++------------------
 email_parser/emailparser.h   |   3 +
 main.cpp                     |  16 +--
 3 files changed, 109 insertions(+), 112 deletions(-)

diff --git a/email_parser/emailparser.cpp b/email_parser/emailparser.cpp
index e103ac2..c7d7bee 100644
--- a/email_parser/emailparser.cpp
+++ b/email_parser/emailparser.cpp
@@ -28,110 +28,16 @@ EmailParser::EmailParser() {
 }
 
 std::map EmailParser::parse(std::string &email_content) {
-    //1. Search "Content-Type: image/.*" in the .eml file.
-    // 1.1 If found 0, go to [2]
-    // 1.2 If found 1, try decoding it, if it's not a QR code, go to [2]
-    // 1.3 Loop through every found entry. If not found in any, go to [2]
-    //2. Try decoding content of the e-mail
-    //3. Search "t=\d{8}T\d{4,6}&s=\d{1,6}\.\d{1,2}&fn=\d{10,16}&i=\d{6}&fp=\d{10}&n=\d". Note that in some emails = and & signs could be replaced with its code in HTTP requests: %3D, %26
-    // 3.1 If not found, notify the user that we could not parse the .eml file
-
-    /* Find image content-types */
-    boost::regex images_content_type_regex("Content-Type: image/(gif|png|jpg)");
-    boost::regex text_content_types_regex("Content-Type: text\\/(html|plain)");
-    boost::regex part_end_regex("--.{5,48}");
     // boost::regex check_data_content("t=\\d+T\\d+&s=\\d+\\.\\d+&fn=\\d{16}&i=\\d{4,5}&fp=\\d{10}&n=\\d");
     boost::regex to_erase("([\\w-]+:\\s*.{2,64}\\r\\n)+");
     boost::regex to_erase_two("--.{5,48}");
 
-    std::vector> images_content_parts = find_parts(images_content_type_regex, part_end_regex, email_content);
+    std::string parameters;
+    parameters = search_in_images(email_content);
+    if (parameters != "") return get_params_from_string(parameters);
 
-    /* iterate through found image content-types and try searching qr codes, decode them and see if it's the needed data */
-
-    for (unsigned int i = 0; i < images_content_parts.size(); i ++) {
-
-        std::string part = email_content.substr(images_content_parts[i].first, images_content_parts[i].second);
-        boost::erase_regex(part, to_erase);
-        boost::erase_regex(part, to_erase_two);
-        part.erase(std::remove(part.begin(), part.end(), '\r'), part.end());
-        part.erase(std::remove(part.begin(), part.end(), '\n'), part.end());
-        std::string decoded = base64_decode(part);
-        cv::Mat image;
-
-        if (decoded.substr(1, 3) == "PNG" || decoded.substr(1, 3) == "JPG") {
-            std::vector data(decoded.begin(), decoded.end());
-            image = cv::imdecode(cv::Mat(data), 1);
-        } else if (decoded.substr(0, 3) == "GIF") {
-            std::string gif_file_path = get_application_home_path() + "/temp.gif";
-
-            std::ofstream gif_output(gif_file_path, std::ios::binary);
-            gif_output << decoded;
-            gif_output.close();
-            cv::VideoCapture gif(gif_file_path, cv::CAP_FFMPEG);
-            gif.read(image);
-        }
-
-        cv::QRCodeDetector qrDecoder = cv::QRCodeDetector();
-        std::string decoded_qr = qrDecoder.detectAndDecode(image);
-        std::string parameters = find_check_parameters(decoded_qr);
-
-        if (parameters != "") {
-            std::map paramsMap = get_params_from_string(parameters);
-
-            return paramsMap;
-        }
-    }
-
-    /* If the E-Mail has no QR code in it as a separate part, there's posibilly a QR code inserted using html's tag with base64-encoded image. Try searching it */
-
-    std::vector> texts_content_parts = find_parts(text_content_types_regex, part_end_regex, email_content);
-
-    for (unsigned int i = 0; i < texts_content_parts.size(); i ++) {
-
-        std::string part = email_content.substr(texts_content_parts[i].first, texts_content_parts[i].second);
-        boost::erase_regex(part, to_erase);
-        boost::erase_regex(part, to_erase_two);
-
-
-        //If there's '<' character, most likely that the part's content is plain html, otherwise it's most likely a base64 encoded html.
-        if (part.find("<") == std::string::npos) {
-            part.erase(std::remove(part.begin(), part.end(), '\r'), part.end());
-            part.erase(std::remove(part.begin(), part.end(), '\n'), part.end());
-            part = base64_decode(part);
-        }
-
-        std::string parameters = find_check_parameters(part);
-
-        if (parameters != "") {
-            std::map paramsMap = get_params_from_string(parameters);
-
-            return paramsMap;
-        }
-
-        std::string url = extract_qr_url_from_img(part);
-        Net n;
-        std::string path = get_path_relative_to_home(".local/share/checks_parser/tmp");
-        n.get_file(url, path);
-
-        std::ifstream ifile(path, std::ios::in | std::ios::binary);
-        const unsigned int size = std::filesystem::file_size(path);
-        std::string qr_code_contents(size, '\0');
-        ifile.read(qr_code_contents.data(), size);
-
-        std::vector data(qr_code_contents.begin(), qr_code_contents.end());
-        cv::Mat image = cv::imdecode(cv::Mat(data), 1);
-
-        cv::QRCodeDetector qrDecoder = cv::QRCodeDetector();
-        std::string decoded_qr = qrDecoder.detectAndDecode(image);
-        parameters = find_check_parameters(decoded_qr);
-
-        if (parameters != "") {
-            std::map paramsMap = get_params_from_string(parameters);
-
-            return paramsMap;
-        }
-    }
-    /* If there's no such case, the last chance is which will have a link with needed parameters or the qr code that should be downloaded and decoded */
+    parameters = search_in_text(email_content);
+    if (parameters != "") return get_params_from_string(parameters);
 
     /* If the code has reached this part and found nothing, it's most likely that there are no QR codes at all. */
 
@@ -139,11 +45,6 @@ std::map EmailParser::parse(std::string &email_content
 }
 
 std::map EmailParser::parse_file(std::string path) {
-    // std::cout << "Parsing " << path << std::endl;
-    // std::ifstream ifile(path, std::ios::in | std::ios::binary);
-    // const unsigned int size = std::filesystem::file_size(path);
-    // std::string content(size, '\0');
-    // ifile.read(content.data(), size);
     std::string content = read_file(path);
     return parse(content);
     return std::map();
@@ -186,3 +87,96 @@ std::string EmailParser::extract_qr_url_from_img(std::string &part) {
     }
     return url;
 }
+
+std::string EmailParser::search_in_images(std::string &content) {
+    boost::regex images_content_type_regex("Content-Type: image/(gif|png|jpg)");
+    boost::regex part_end_regex("--.{5,48}");
+    boost::regex to_erase("([\\w-]+:\\s*.{2,64}\\r\\n)+");
+    boost::regex to_erase_two("--.{5,48}");
+
+    std::vector> images_content_parts = find_parts(images_content_type_regex, part_end_regex, content);
+
+    /* iterate through found image content-types and try searching qr codes, decode them and see if it's the needed data */
+
+    for (unsigned int i = 0; i < images_content_parts.size(); i ++) {
+
+        std::string part = content.substr(images_content_parts[i].first, images_content_parts[i].second);
+        boost::erase_regex(part, to_erase);
+        boost::erase_regex(part, to_erase_two);
+        part.erase(std::remove(part.begin(), part.end(), '\r'), part.end());
+        part.erase(std::remove(part.begin(), part.end(), '\n'), part.end());
+        std::string decoded = base64_decode(part);
+        cv::Mat image;
+
+        if (decoded.substr(1, 3) == "PNG" || decoded.substr(1, 3) == "JPG") {
+            std::vector data(decoded.begin(), decoded.end());
+            image = cv::imdecode(cv::Mat(data), 1);
+        } else if (decoded.substr(0, 3) == "GIF") {
+            std::string gif_file_path = get_application_home_path() + "/temp.gif";
+
+            std::ofstream gif_output(gif_file_path, std::ios::binary);
+            gif_output << decoded;
+            gif_output.close();
+            cv::VideoCapture gif(gif_file_path, cv::CAP_FFMPEG);
+            gif.read(image);
+        }
+
+        cv::QRCodeDetector qrDecoder = cv::QRCodeDetector();
+        std::string decoded_qr = qrDecoder.detectAndDecode(image);
+        std::string parameters = find_check_parameters(decoded_qr);
+
+        // Return only on a hit, so the remaining image parts still get scanned.
+        if (parameters != "") return parameters;
+    }
+    return "";
+}
+
+std::string EmailParser::search_in_text(std::string &content) {
+    boost::regex text_content_types_regex("Content-Type: text\\/(html|plain)");
+    boost::regex part_end_regex("--.{5,48}");
+    boost::regex to_erase("([\\w-]+:\\s*.{2,64}\\r\\n)+");
+    boost::regex to_erase_two("--.{5,48}");
+    /* If the E-Mail has no QR code in it as a separate part, there's posibilly a QR code inserted using html's tag with base64-encoded image. Try searching it */
+
+    std::vector> texts_content_parts = find_parts(text_content_types_regex, part_end_regex, content);
+
+    for (unsigned int i = 0; i < texts_content_parts.size(); i ++) {
+
+        std::string part = content.substr(texts_content_parts[i].first, texts_content_parts[i].second);
+        boost::erase_regex(part, to_erase);
+        boost::erase_regex(part, to_erase_two);
+
+
+        //If there's '<' character, most likely that the part's content is plain html, otherwise it's most likely a base64 encoded html.
+        if (part.find("<") == std::string::npos) {
+            part.erase(std::remove(part.begin(), part.end(), '\r'), part.end());
+            part.erase(std::remove(part.begin(), part.end(), '\n'), part.end());
+            part = base64_decode(part);
+        }
+
+        // Try searching parameters just in plain html. Will help if there's a link to a QR code with it's parameters passed in request.
+        std::string parameters = find_check_parameters(part);
+
+        if (parameters != "") return parameters;
+
+        // If there's no, try search anything that looks like a link to a qr code.
+        std::string url = extract_qr_url_from_img(part);
+        Net n;
+        std::string path = get_path_relative_to_home(".local/share/checks_parser/tmp");
+        n.get_file(url, path);
+
+        std::string qr_code_contents = read_file(path);
+
+        std::vector data(qr_code_contents.begin(), qr_code_contents.end());
+        cv::Mat image = cv::imdecode(cv::Mat(data), 1);
+
+        cv::QRCodeDetector qrDecoder = cv::QRCodeDetector();
+        std::string decoded_qr = qrDecoder.detectAndDecode(image);
+        parameters = find_check_parameters(decoded_qr);
+
+        if (parameters != "") return parameters;
+
+        // if there's no any link that looks like a link to QR code, maybe the qr code is encoded as base64 inside an img tag.
+    }
+    return "";
+}
diff --git a/email_parser/emailparser.h b/email_parser/emailparser.h
index 9452867..bcaefaf 100644
--- a/email_parser/emailparser.h
+++ b/email_parser/emailparser.h
@@ -13,6 +13,9 @@ public:
     std::vector> find_parts(const boost::regex &start_regex, const boost::regex &end_regex, const std::string &content);
     std::string find_check_parameters(std::string &part);
     std::string extract_qr_url_from_img(std::string &part);
+
+    std::string search_in_images(std::string &content);
+    std::string search_in_text(std::string &content);
 };
 
 #endif // CHECKS_PARSER_EMAIL_PARSER
diff --git a/main.cpp b/main.cpp
index da1cfb0..9a4985e 100644
--- a/main.cpp
+++ b/main.cpp
@@ -33,14 +33,14 @@
 
 int main(int argc, char *argv[])
 {
-    // EmailParser p;
-    // p.parse_file("/home/leca/example_email_receipts/читай_город.eml");
-    // p.parse_file("/home/leca/example_email_receipts/lenta.eml");
-    // p.parse_file("/home/leca/example_email_receipts/magnit.eml");
-    // p.parse_file("/home/leca/example_email_receipts/pyaterochka.eml");
-    // p.parse_file("/home/leca/example_email_receipts/rzd.eml");
-    // p.parse_file("/home/leca/example_email_receipts/russteels.eml");
-    // return 0;
+    EmailParser p;
+    p.parse_file("/home/leca/example_email_receipts/читай_город.eml");
+    p.parse_file("/home/leca/example_email_receipts/lenta.eml");
+    p.parse_file("/home/leca/example_email_receipts/magnit.eml");
+    p.parse_file("/home/leca/example_email_receipts/pyaterochka.eml");
+    p.parse_file("/home/leca/example_email_receipts/rzd.eml");
+    p.parse_file("/home/leca/example_email_receipts/russteels.eml");
+    return 0;
 
     curl_global_init(CURL_GLOBAL_ALL);
     qRegisterMetaType("Check");