further improvements

This commit is contained in:
leca 2025-06-11 01:31:07 +03:00
parent 5afaf6a94f
commit 07c7e49a21
5 changed files with 74 additions and 34 deletions

View File

@ -34,10 +34,17 @@ std::map<std::string, std::string> EmailParser::parse(std::string &email_content
std::string parameters; std::string parameters;
parameters = search_in_images(email_content); parameters = search_in_images(email_content);
if (parameters != "") return get_params_from_string(parameters);
if (parameters != "") {
std::cout << parameters << std::endl;
return get_params_from_string(parameters);
}
parameters = search_in_text(email_content); parameters = search_in_text(email_content);
if (parameters != "") return get_params_from_string(parameters); if (parameters != "") {
std::cout << parameters << std::endl;
return get_params_from_string(parameters);
}
/* If the code has reached this part and found nothing, it's most likely that there are no QR codes at all. */ /* If the code has reached this part and found nothing, it's most likely that there are no QR codes at all. */
@ -74,7 +81,6 @@ std::string EmailParser::find_check_parameters(std::string &part) {
} }
std::string EmailParser::extract_qr_url_from_img(std::string &part) { std::string EmailParser::extract_qr_url_from_img(std::string &part) {
std::string url = "";
boost::regex img_tag_regex("<img[^\\n\\r<]*>"); boost::regex img_tag_regex("<img[^\\n\\r<]*>");
boost::regex img_url_str("https?:\\/\\/.*(qr(code)?)[^\\n\\r\"]+", boost::regex::icase); boost::regex img_url_str("https?:\\/\\/.*(qr(code)?)[^\\n\\r\"]+", boost::regex::icase);
@ -85,7 +91,22 @@ std::string EmailParser::extract_qr_url_from_img(std::string &part) {
} }
} }
return url; return "";
}
std::vector<std::string> EmailParser::extract_qr_embeddings_from_part(std::string &part) {
std::vector<std::string> embeddings = {};
boost::regex img_tag_regex("<img[^\\n\\r<]*>");
boost::regex img_base64_str("data:image\\/(png|jpg);base64,[\\w+\\/=]+", boost::regex::icase);
for (boost::sregex_iterator it{part.begin(), part.end(), img_tag_regex}, end{}; it != end; it++) {
std::string img_tag = it->str();
for (boost::sregex_iterator it2{img_tag.begin(), img_tag.end(), img_base64_str}, end2{}; it2 != end2; it2++) {
embeddings.push_back(split(it2->str(), ",")[1]);
}
}
return embeddings;
} }
std::string EmailParser::search_in_images(std::string &content) { std::string EmailParser::search_in_images(std::string &content) {
@ -106,24 +127,7 @@ std::string EmailParser::search_in_images(std::string &content) {
part.erase(std::remove(part.begin(), part.end(), '\r'), part.end()); part.erase(std::remove(part.begin(), part.end(), '\r'), part.end());
part.erase(std::remove(part.begin(), part.end(), '\n'), part.end()); part.erase(std::remove(part.begin(), part.end(), '\n'), part.end());
std::string decoded = base64_decode(part); std::string decoded = base64_decode(part);
cv::Mat image; return handle_image(decoded);
if (decoded.substr(1, 3) == "PNG" || decoded.substr(1, 3) == "JPG") {
std::vector<uchar> data(decoded.begin(), decoded.end());
image = cv::imdecode(cv::Mat(data), 1);
} else if (decoded.substr(0, 3) == "GIF") {
std::string gif_file_path = get_application_home_path() + "/temp.gif";
std::ofstream gif_output(gif_file_path, std::ios::binary);
gif_output << decoded;
gif_output.close();
cv::VideoCapture gif(gif_file_path, cv::CAP_FFMPEG);
gif.read(image);
}
cv::QRCodeDetector qrDecoder = cv::QRCodeDetector();
std::string decoded_qr = qrDecoder.detectAndDecode(image);
return find_check_parameters(decoded_qr);
} }
return ""; return "";
} }
@ -158,22 +162,52 @@ std::string EmailParser::search_in_text(std::string &content) {
// If there's no, try search anything that looks like a link to a qr code. // If there's no, try search anything that looks like a link to a qr code.
std::string url = extract_qr_url_from_img(part); std::string url = extract_qr_url_from_img(part);
Net n; if (url != "") {
std::string path = get_path_relative_to_home(".local/share/checks_parser/tmp"); Net n;
n.get_file(url, path); std::string path = get_path_relative_to_home(".local/share/checks_parser/tmp");
n.get_file(url, path);
std::string qr_code_contents = read_file(path); std::string qr_code_contents = read_file(path);
std::vector<uchar> data(qr_code_contents.begin(), qr_code_contents.end()); std::vector<uchar> data(qr_code_contents.begin(), qr_code_contents.end());
cv::Mat image = cv::imdecode(cv::Mat(data), 1); cv::Mat image = cv::imdecode(cv::Mat(data), 1);
cv::QRCodeDetector qrDecoder = cv::QRCodeDetector();
std::string decoded_qr = qrDecoder.detectAndDecode(image);
parameters = find_check_parameters(decoded_qr);
cv::QRCodeDetector qrDecoder = cv::QRCodeDetector();
std::string decoded_qr = qrDecoder.detectAndDecode(image);
parameters = find_check_parameters(decoded_qr);
}
if (parameters != "") return parameters; if (parameters != "") return parameters;
// if there's no any link that looks like a link to QR code, maybe the qr code is encoded as base64 inside an img tag. // if there's no any link that looks like a link to QR code, maybe the qr code is encoded as base64 inside an img tag.
std::vector<std::string> embeddings = extract_qr_embeddings_from_part(part);
for (std::string &embedding : embeddings) {
std::string decoded = base64_decode(embedding);
parameters = handle_image(decoded);
if (parameters != "") return parameters;
}
} }
return ""; return "";
} }
std::string EmailParser::handle_image(std::string &content) {
cv::Mat image;
if (content.substr(1, 3) == "PNG" || content.substr(1, 3) == "JPG") {
std::vector<uchar> data(content.begin(), content.end());
image = cv::imdecode(cv::Mat(data), 1);
} else if (content.substr(0, 3) == "GIF") {
std::string gif_file_path = get_application_home_path() + "/temp.gif";
std::ofstream gif_output(gif_file_path, std::ios::binary);
gif_output << content;
gif_output.close();
cv::VideoCapture gif(gif_file_path, cv::CAP_FFMPEG);
gif.read(image);
}
if (image.empty()) return "";
cv::QRCodeDetector qrDecoder = cv::QRCodeDetector();
std::string decoded_qr = qrDecoder.detectAndDecode(image);
return find_check_parameters(decoded_qr);
}

View File

@ -13,9 +13,12 @@ public:
std::vector<std::pair<int, int>> find_parts(const boost::regex &start_regex, const boost::regex &end_regex, const std::string &content); std::vector<std::pair<int, int>> find_parts(const boost::regex &start_regex, const boost::regex &end_regex, const std::string &content);
std::string find_check_parameters(std::string &part); std::string find_check_parameters(std::string &part);
std::string extract_qr_url_from_img(std::string &part); std::string extract_qr_url_from_img(std::string &part);
std::vector<std::string> extract_qr_embeddings_from_part(std::string &part);
std::string search_in_images(std::string &content); std::string search_in_images(std::string &content);
std::string search_in_text(std::string &content); std::string search_in_text(std::string &content);
std::string handle_image(std::string &content);
}; };
#endif // CHECKS_PARSER_EMAIL_PARSER #endif // CHECKS_PARSER_EMAIL_PARSER

View File

@ -34,12 +34,15 @@
int main(int argc, char *argv[]) { int main(int argc, char *argv[]) {
EmailParser p; EmailParser p;
p.parse_file("/home/leca/example_email_receipts/lamoda.eml");
// p.parse_file("/home/leca/example_email_receipts/lamoda2.eml");
p.parse_file("/home/leca/example_email_receipts/читай_город.eml"); p.parse_file("/home/leca/example_email_receipts/читай_город.eml");
p.parse_file("/home/leca/example_email_receipts/lenta.eml"); p.parse_file("/home/leca/example_email_receipts/lenta.eml");
p.parse_file("/home/leca/example_email_receipts/magnit.eml"); p.parse_file("/home/leca/example_email_receipts/magnit.eml");
p.parse_file("/home/leca/example_email_receipts/pyaterochka.eml"); p.parse_file("/home/leca/example_email_receipts/pyaterochka.eml");
p.parse_file("/home/leca/example_email_receipts/rzd.eml"); p.parse_file("/home/leca/example_email_receipts/rzd.eml");
p.parse_file("/home/leca/example_email_receipts/russteels.eml"); p.parse_file("/home/leca/example_email_receipts/russteels.eml");
p.parse_file("/home/leca/example_email_receipts/avtodor.eml");
return 0; return 0;
curl_global_init(CURL_GLOBAL_ALL); curl_global_init(CURL_GLOBAL_ALL);
qRegisterMetaType<Check>("Check"); qRegisterMetaType<Check>("Check");

View File

@ -661,7 +661,7 @@
<context> <context>
<name>QObject</name> <name>QObject</name>
<message> <message>
<location filename="../main.cpp" line="74"/> <location filename="../main.cpp" line="77"/>
<source>Using locale: </source> <source>Using locale: </source>
<translation>Using locale: </translation> <translation>Using locale: </translation>
</message> </message>

View File

@ -637,7 +637,7 @@
<context> <context>
<name>QObject</name> <name>QObject</name>
<message> <message>
<location filename="../main.cpp" line="74"/> <location filename="../main.cpp" line="77"/>
<source>Using locale: </source> <source>Using locale: </source>
<translation>Использую локаль: </translation> <translation>Использую локаль: </translation>
</message> </message>