checks-parser/email_parser/emailparser.cpp

206 lines
8.1 KiB
C++

#include <email_parser/emailparser.h>
#include <opencv2/opencv.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/objdetect.hpp>
#include <opencv2/core/core.hpp>
#include <opencv2/videoio.hpp>
#include <utils/utils.h>
#include <utils/base64.h>
#include <check/check.h>
#include <boost/regex.hpp>
#include <boost/algorithm/string/regex.hpp>
#include <boost/algorithm/string.hpp>
#include <iostream>
#include <fstream>
#if __GNUC__ < 8 && __clang_major__ < 17
# include <experimental/filesystem>
using namespace std::experimental::filesystem;
#else
# include <filesystem>
using namespace std::filesystem;
#endif
/**
 * Compiles the regular expressions shared by the part-searching methods.
 *
 *  - headings_regex   : one or more consecutive "Name: value\r\n" lines whose
 *                       value is 2 to 64 characters long.
 *  - end_marker_regex : "--" followed by 5 to 57 characters that are not
 *                       CR, LF, '<' or '>'.
 *  - part_end_regex   : same as end_marker_regex, but spaces are excluded too.
 *
 * NOTE(review): the "--..." patterns presumably target MIME boundary lines -
 * confirm against the e-mails this parser is fed.
 */
EmailParser::EmailParser() {
    headings_regex.assign("([\\w-]+:\\s*.{2,64}\\r\\n)+");
    end_marker_regex.assign("--[^\\n\\r<>]{5,57}");
    part_end_regex.assign("--[^\\n\\r<> ]{5,57}");
}
/**
 * Extracts check parameters from the raw text of an e-mail.
 *
 * Image parts are examined first (search_in_images); if they yield nothing the
 * text parts are examined (search_in_text). The first non-empty parameter
 * string is converted with get_params_from_string and returned.
 *
 * @param email_content raw e-mail contents.
 * @return the parsed parameters, or an empty map when nothing was found (in
 *         which case "Failed to parse" is printed to stdout).
 */
std::map<std::string, std::string> EmailParser::parse(std::string &email_content) {
    std::string from_images = search_in_images(email_content);
    if (!from_images.empty()) {
        return get_params_from_string(from_images);
    }

    std::string from_text = search_in_text(email_content);
    if (!from_text.empty()) {
        return get_params_from_string(from_text);
    }

    // Neither search produced anything: most likely the e-mail carries no QR code at all.
    std::cout << "Failed to parse" << std::endl;
    return {};
}
/**
 * Reads an e-mail from disk and parses it.
 *
 * Prints "Parsing file <path>" to stdout, loads the file with read_file and
 * forwards its contents to parse().
 *
 * @param path location of the e-mail file.
 * @return whatever parse() returns for the file contents.
 */
std::map<std::string, std::string> EmailParser::parse_file(std::string path) {
    std::cout << "Parsing file " << path << std::endl;
    std::string content = read_file(path);
    return parse(content);
}
/**
 * Locates every part of `content` that begins at a match of `start_regex`.
 *
 * For each start match the text from that match onwards is searched for the
 * first occurrence of `end_regex`.
 *
 * @param start_regex pattern marking the beginning of a part.
 * @param end_regex   pattern marking the end of a part.
 * @param content     text to scan.
 * @return one pair per start match:
 *         - first  : offset of the start match inside `content`;
 *         - second : offset of the end match measured FROM THE START MATCH
 *                    (i.e. the part's length, which is how callers feed it to
 *                    std::string::substr), or content.length() when no end
 *                    match exists after the start.
 */
std::vector<std::pair<int, int>> EmailParser::find_parts(const boost::regex &start_regex, const boost::regex &end_regex, const std::string &content) {
    std::vector<std::pair<int, int>> found;
    const boost::sregex_iterator no_more_matches;

    for (boost::sregex_iterator opener(content.begin(), content.end(), start_regex); opener != no_more_matches; ++opener) {
        const unsigned int offset = opener->position();
        unsigned int span = content.length();

        // Only the first end marker after the start matters.
        boost::smatch closer;
        if (boost::regex_search(content.begin() + offset, content.end(), closer, end_regex)) {
            span = closer.position();
        }

        found.push_back(std::pair<int, int>(offset, span));
    }

    return found;
}
std::string EmailParser::find_check_parameters(std::string &part) {
boost::regex params_regex ("t(=|(%|=)3d)\\d+T\\d+(&(amp;)?|%26)s\\1\\d+\\.\\d+\\3fn\\1\\d{16}\\3i\\1\\d{3,6}\\3fp\\1\\d{9,10}\\3n\\1\\d", boost::regex::icase);
boost::smatch matched;
if (boost::regex_search(part, matched, params_regex))
return matched[0].str();
return "";
}
std::string EmailParser::extract_qr_url_from_img(std::string &part) {
boost::regex img_url_regex("(?<=<img src=\")https:\\/\\/[^\\n\\r\"]*\\/qr(code)?[^\\n\\r\"]*", boost::regex::icase);
boost::smatch matched;
if (boost::regex_search(part, matched, img_url_regex))
return matched[0].str();
return "";
}
/**
 * Reads the value of the "Content-Transfer-Encoding" header of a part.
 *
 * The header name is matched case-insensitively; the value is whatever
 * follows "content-transfer-encoding: " up to the terminating CRLF (at most
 * 20 characters).
 *
 * @param part text of the part, including its headers.
 * @return the value lower-cased and trimmed of surrounding whitespace, or ""
 *         when the header is absent.
 */
std::string EmailParser::extract_content_transfer_encoding(std::string &part) {
    boost::regex content_transfer_encoding_regex("(?<=content-transfer-encoding: ).{0,20}(?=\\r\\n)", boost::regex::icase);
    boost::smatch matched;
    if (!boost::regex_search(part, matched, content_transfer_encoding_regex)) {
        return "";
    }

    std::string transfer_encoding = matched[0].str();
    // boost::to_lower goes through the locale's ctype facet, so it is safe for
    // bytes >= 0x80; feeding a negative plain char to ::tolower is undefined.
    boost::to_lower(transfer_encoding);
    boost::trim(transfer_encoding);
    return transfer_encoding;
}
std::vector<std::string> EmailParser::extract_qr_embeddings_from_part(std::string &part) {
std::vector<std::string> embeddings = {};
boost::regex img_base64_regex("(?<=<img src=\"data:image\\/(png|jpg);base64,)[^\n\r\"]*", boost::regex::icase);
boost::smatch matched;
if (boost::regex_search(part, matched, img_base64_regex)) {
for (unsigned int i = 0; i < matched.size(); i ++) {
embeddings.push_back(matched[i].str());
}
}
return embeddings;
}
std::string EmailParser::search_in_images(std::string &content) {
boost::regex images_content_type_regex("Content-Type: image/(gif|png|jpg)", boost::regex::icase);
std::vector<std::pair<int, int>> images_content_parts = find_parts(images_content_type_regex, part_end_regex, content);
/* iterate through found image content-types and try searching qr codes, decode them and see if it's the needed data */
for (unsigned int i = 0; i < images_content_parts.size(); i ++) {
std::string part = content.substr(images_content_parts[i].first, images_content_parts[i].second);
boost::erase_regex(part, headings_regex);
boost::erase_regex(part, end_marker_regex);
boost::erase_all_regex(part, boost::regex("\\r\\n"));
std::string decoded = base64_decode(part);
return handle_image(decoded);
}
return "";
}
/**
 * Looks for check parameters inside the text/html and text/plain parts.
 *
 * For every such part the transfer encoding is undone (soft line breaks are
 * removed for quoted-printable; base64 is decoded) and then three strategies
 * are tried in order, returning as soon as one succeeds:
 *   1. the parameters appear literally in the text (e.g. inside a link);
 *   2. an <img> tag points at a remote QR image, which is downloaded and
 *      decoded;
 *   3. an <img> tag embeds a base64-encoded image, which is decoded.
 *
 * @param content raw e-mail contents.
 * @return the parameter string, or "" when no text part yields one.
 */
std::string EmailParser::search_in_text(std::string &content) {
    boost::regex text_content_types_regex("Content-Type: text\\/(html|plain)", boost::regex::icase);
    std::vector<std::pair<int, int>> text_parts = find_parts(text_content_types_regex, part_end_regex, content);

    for (const std::pair<int, int> &bounds : text_parts) {
        std::string part = content.substr(bounds.first, bounds.second);

        // The encoding has to be read before the headers are stripped below.
        std::string transfer_encoding = extract_content_transfer_encoding(part);
        boost::erase_regex(part, headings_regex);
        boost::erase_regex(part, end_marker_regex);

        if (transfer_encoding == "quoted-printable") {
            boost::erase_all_regex(part, boost::regex("=\\r\\n"));
        } else if (transfer_encoding == "base64") {
            boost::erase_all_regex(part, boost::regex("\\r\\n"));
            part = base64_decode(part);
        }

        // Strategy 1: the parameters are present in the text itself, for
        // instance as the query string of a link to a QR code.
        std::string parameters = find_check_parameters(part);
        if (!parameters.empty()) {
            return parameters;
        }

        // Strategy 2: anything that looks like a link to a QR code image.
        std::string url = extract_qr_url_from_img(part);
        if (!url.empty()) {
            Net n;
            std::string path = get_path_relative_to_home(".local/share/checks_parser/tmp");
            n.get_file(url, path);
            std::string qr_code_contents = read_file(path);
            parameters = handle_image(qr_code_contents);
            if (!parameters.empty()) {
                return parameters;
            }
        }

        // Strategy 3: the QR code may be base64-encoded inside an <img> tag.
        std::vector<std::string> embeddings = extract_qr_embeddings_from_part(part);
        for (std::size_t k = 0; k < embeddings.size(); ++k) {
            std::string decoded = base64_decode(embeddings[k]);
            parameters = handle_image(decoded);
            if (!parameters.empty()) {
                return parameters;
            }
        }
    }

    return "";
}
/**
 * Decodes a QR code from raw image bytes and extracts check parameters.
 *
 * The image type is recognised from its leading bytes:
 *   - PNG  : "PNG" at offset 1;
 *   - JPEG : the FF D8 FF start-of-image marker, "JPG" at offset 1 or "JFIF"
 *            at offset 6;
 *   - GIF  : "GIF" at offset 0. GIFs are written to "<app home>/temp.gif" and
 *            their first frame is read back through cv::VideoCapture.
 * All signature checks are bounds-safe, so empty or truncated input simply
 * yields "" instead of throwing std::out_of_range.
 *
 * @param content raw bytes of the image file.
 * @return the parameter string found in the decoded QR code, or "" when the
 *         data is not a recognised image, cannot be decoded, or its QR
 *         payload holds no check parameters.
 */
std::string EmailParser::handle_image(std::string &content) {
    // True when `content` holds `magic` at byte `offset`; never reads out of range.
    const auto has_magic = [&content](std::size_t offset, const std::string &magic) {
        return content.size() >= offset + magic.size()
            && content.compare(offset, magic.size(), magic) == 0;
    };

    const bool is_png = has_magic(1, "PNG");
    const bool is_jpeg = has_magic(0, "\xFF\xD8\xFF") || has_magic(1, "JPG") || has_magic(6, "JFIF");

    cv::Mat image;
    if (is_png || is_jpeg) {
        std::vector<uchar> data(content.begin(), content.end());
        image = cv::imdecode(cv::Mat(data), cv::IMREAD_COLOR);
    } else if (has_magic(0, "GIF")) {
        const std::string gif_file_path = get_application_home_path() + "/temp.gif";
        {
            std::ofstream gif_output(gif_file_path, std::ios::binary);
            gif_output.write(content.data(), static_cast<std::streamsize>(content.size()));
            gif_output.close();
            // Without this check a failed write could leave a stale temp.gif
            // from an earlier e-mail to be decoded instead.
            if (!gif_output) return "";
        }
        cv::VideoCapture gif(gif_file_path, cv::CAP_FFMPEG);
        gif.read(image);
    }
    if (image.empty()) return "";

    cv::QRCodeDetector qrDecoder;
    std::string decoded_qr = qrDecoder.detectAndDecode(image);
    return find_check_parameters(decoded_qr);
}