divide into functions
This commit is contained in:
parent
c9f447009b
commit
5afaf6a94f
|
@ -28,110 +28,16 @@ EmailParser::EmailParser() {
|
||||||
}
|
}
|
||||||
|
|
||||||
std::map<std::string, std::string> EmailParser::parse(std::string &email_content) {
|
std::map<std::string, std::string> EmailParser::parse(std::string &email_content) {
|
||||||
//1. Search "Content-Type: image/.*" in the .eml file.
|
|
||||||
// 1.1 If found 0, go to [2]
|
|
||||||
// 1.2 If found 1, try decoding it, if it's not a QR code, go to [2]
|
|
||||||
// 1.3 Loop through every found entry. If not found in any, go to [2]
|
|
||||||
//2. Try decoding content of the e-mail
|
|
||||||
//3. Search "t=\d{8}T\d{4,6}&s=\d{1,6}\.\d{1,2}&fn=\d{10,16}&i=\d{6}&fp=\d{10}&n=\d". Note that in some emails = and & signs could be replaced with its code in HTTP requests: %3D, %26
|
|
||||||
// 3.1 If not found, notify the user that we could not parse the .eml file
|
|
||||||
|
|
||||||
/* Find image content-types */
|
|
||||||
boost::regex images_content_type_regex("Content-Type: image/(gif|png|jpg)");
|
|
||||||
boost::regex text_content_types_regex("Content-Type: text\\/(html|plain)");
|
|
||||||
boost::regex part_end_regex("--.{5,48}");
|
|
||||||
// boost::regex check_data_content("t=\\d+T\\d+&s=\\d+\\.\\d+&fn=\\d{16}&i=\\d{4,5}&fp=\\d{10}&n=\\d");
|
// boost::regex check_data_content("t=\\d+T\\d+&s=\\d+\\.\\d+&fn=\\d{16}&i=\\d{4,5}&fp=\\d{10}&n=\\d");
|
||||||
boost::regex to_erase("([\\w-]+:\\s*.{2,64}\\r\\n)+");
|
boost::regex to_erase("([\\w-]+:\\s*.{2,64}\\r\\n)+");
|
||||||
boost::regex to_erase_two("--.{5,48}");
|
boost::regex to_erase_two("--.{5,48}");
|
||||||
|
|
||||||
std::vector<std::pair<int, int>> images_content_parts = find_parts(images_content_type_regex, part_end_regex, email_content);
|
std::string parameters;
|
||||||
|
parameters = search_in_images(email_content);
|
||||||
|
if (parameters != "") return get_params_from_string(parameters);
|
||||||
|
|
||||||
/* iterate through found image content-types and try searching qr codes, decode them and see if it's the needed data */
|
parameters = search_in_text(email_content);
|
||||||
|
if (parameters != "") return get_params_from_string(parameters);
|
||||||
for (unsigned int i = 0; i < images_content_parts.size(); i ++) {
|
|
||||||
|
|
||||||
std::string part = email_content.substr(images_content_parts[i].first, images_content_parts[i].second);
|
|
||||||
boost::erase_regex(part, to_erase);
|
|
||||||
boost::erase_regex(part, to_erase_two);
|
|
||||||
part.erase(std::remove(part.begin(), part.end(), '\r'), part.end());
|
|
||||||
part.erase(std::remove(part.begin(), part.end(), '\n'), part.end());
|
|
||||||
std::string decoded = base64_decode(part);
|
|
||||||
cv::Mat image;
|
|
||||||
|
|
||||||
if (decoded.substr(1, 3) == "PNG" || decoded.substr(1, 3) == "JPG") {
|
|
||||||
std::vector<uchar> data(decoded.begin(), decoded.end());
|
|
||||||
image = cv::imdecode(cv::Mat(data), 1);
|
|
||||||
} else if (decoded.substr(0, 3) == "GIF") {
|
|
||||||
std::string gif_file_path = get_application_home_path() + "/temp.gif";
|
|
||||||
|
|
||||||
std::ofstream gif_output(gif_file_path, std::ios::binary);
|
|
||||||
gif_output << decoded;
|
|
||||||
gif_output.close();
|
|
||||||
cv::VideoCapture gif(gif_file_path, cv::CAP_FFMPEG);
|
|
||||||
gif.read(image);
|
|
||||||
}
|
|
||||||
|
|
||||||
cv::QRCodeDetector qrDecoder = cv::QRCodeDetector();
|
|
||||||
std::string decoded_qr = qrDecoder.detectAndDecode(image);
|
|
||||||
std::string parameters = find_check_parameters(decoded_qr);
|
|
||||||
|
|
||||||
if (parameters != "") {
|
|
||||||
std::map<std::string, std::string> paramsMap = get_params_from_string(parameters);
|
|
||||||
|
|
||||||
return paramsMap;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* If the e-mail has no QR code in it as a separate part, there is possibly a QR code inserted using HTML's <img> tag with a base64-encoded image. Try searching for it. */
|
|
||||||
|
|
||||||
std::vector<std::pair<int, int>> texts_content_parts = find_parts(text_content_types_regex, part_end_regex, email_content);
|
|
||||||
|
|
||||||
for (unsigned int i = 0; i < texts_content_parts.size(); i ++) {
|
|
||||||
|
|
||||||
std::string part = email_content.substr(texts_content_parts[i].first, texts_content_parts[i].second);
|
|
||||||
boost::erase_regex(part, to_erase);
|
|
||||||
boost::erase_regex(part, to_erase_two);
|
|
||||||
|
|
||||||
|
|
||||||
//If there's '<' character, most likely that the part's content is plain html, otherwise it's most likely a base64 encoded html.
|
|
||||||
if (part.find("<") == std::string::npos) {
|
|
||||||
part.erase(std::remove(part.begin(), part.end(), '\r'), part.end());
|
|
||||||
part.erase(std::remove(part.begin(), part.end(), '\n'), part.end());
|
|
||||||
part = base64_decode(part);
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string parameters = find_check_parameters(part);
|
|
||||||
|
|
||||||
if (parameters != "") {
|
|
||||||
std::map<std::string, std::string> paramsMap = get_params_from_string(parameters);
|
|
||||||
|
|
||||||
return paramsMap;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string url = extract_qr_url_from_img(part);
|
|
||||||
Net n;
|
|
||||||
std::string path = get_path_relative_to_home(".local/share/checks_parser/tmp");
|
|
||||||
n.get_file(url, path);
|
|
||||||
|
|
||||||
std::ifstream ifile(path, std::ios::in | std::ios::binary);
|
|
||||||
const unsigned int size = std::filesystem::file_size(path);
|
|
||||||
std::string qr_code_contents(size, '\0');
|
|
||||||
ifile.read(qr_code_contents.data(), size);
|
|
||||||
|
|
||||||
std::vector<uchar> data(qr_code_contents.begin(), qr_code_contents.end());
|
|
||||||
cv::Mat image = cv::imdecode(cv::Mat(data), 1);
|
|
||||||
|
|
||||||
cv::QRCodeDetector qrDecoder = cv::QRCodeDetector();
|
|
||||||
std::string decoded_qr = qrDecoder.detectAndDecode(image);
|
|
||||||
parameters = find_check_parameters(decoded_qr);
|
|
||||||
|
|
||||||
if (parameters != "") {
|
|
||||||
std::map<std::string, std::string> paramsMap = get_params_from_string(parameters);
|
|
||||||
|
|
||||||
return paramsMap;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
/* If there's no such case, the last chance is <img src="..."> which will have a link with needed parameters or the qr code that should be downloaded and decoded */
|
|
||||||
|
|
||||||
/* If the code has reached this part and found nothing, it's most likely that there are no QR codes at all. */
|
/* If the code has reached this part and found nothing, it's most likely that there are no QR codes at all. */
|
||||||
|
|
||||||
|
@ -139,11 +45,6 @@ std::map<std::string, std::string> EmailParser::parse(std::string &email_content
|
||||||
}
|
}
|
||||||
|
|
||||||
std::map<std::string, std::string> EmailParser::parse_file(std::string path) {
|
std::map<std::string, std::string> EmailParser::parse_file(std::string path) {
|
||||||
// std::cout << "Parsing " << path << std::endl;
|
|
||||||
// std::ifstream ifile(path, std::ios::in | std::ios::binary);
|
|
||||||
// const unsigned int size = std::filesystem::file_size(path);
|
|
||||||
// std::string content(size, '\0');
|
|
||||||
// ifile.read(content.data(), size);
|
|
||||||
std::string content = read_file(path);
|
std::string content = read_file(path);
|
||||||
return parse(content);
|
return parse(content);
|
||||||
return std::map<std::string, std::string>();
|
return std::map<std::string, std::string>();
|
||||||
|
@ -186,3 +87,93 @@ std::string EmailParser::extract_qr_url_from_img(std::string &part) {
|
||||||
}
|
}
|
||||||
return url;
|
return url;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::string EmailParser::search_in_images(std::string &content) {
|
||||||
|
boost::regex images_content_type_regex("Content-Type: image/(gif|png|jpg)");
|
||||||
|
boost::regex part_end_regex("--.{5,48}");
|
||||||
|
boost::regex to_erase("([\\w-]+:\\s*.{2,64}\\r\\n)+");
|
||||||
|
boost::regex to_erase_two("--.{5,48}");
|
||||||
|
|
||||||
|
std::vector<std::pair<int, int>> images_content_parts = find_parts(images_content_type_regex, part_end_regex, content);
|
||||||
|
|
||||||
|
/* iterate through found image content-types and try searching qr codes, decode them and see if it's the needed data */
|
||||||
|
|
||||||
|
for (unsigned int i = 0; i < images_content_parts.size(); i ++) {
|
||||||
|
|
||||||
|
std::string part = content.substr(images_content_parts[i].first, images_content_parts[i].second);
|
||||||
|
boost::erase_regex(part, to_erase);
|
||||||
|
boost::erase_regex(part, to_erase_two);
|
||||||
|
part.erase(std::remove(part.begin(), part.end(), '\r'), part.end());
|
||||||
|
part.erase(std::remove(part.begin(), part.end(), '\n'), part.end());
|
||||||
|
std::string decoded = base64_decode(part);
|
||||||
|
cv::Mat image;
|
||||||
|
|
||||||
|
if (decoded.substr(1, 3) == "PNG" || decoded.substr(1, 3) == "JPG") {
|
||||||
|
std::vector<uchar> data(decoded.begin(), decoded.end());
|
||||||
|
image = cv::imdecode(cv::Mat(data), 1);
|
||||||
|
} else if (decoded.substr(0, 3) == "GIF") {
|
||||||
|
std::string gif_file_path = get_application_home_path() + "/temp.gif";
|
||||||
|
|
||||||
|
std::ofstream gif_output(gif_file_path, std::ios::binary);
|
||||||
|
gif_output << decoded;
|
||||||
|
gif_output.close();
|
||||||
|
cv::VideoCapture gif(gif_file_path, cv::CAP_FFMPEG);
|
||||||
|
gif.read(image);
|
||||||
|
}
|
||||||
|
|
||||||
|
cv::QRCodeDetector qrDecoder = cv::QRCodeDetector();
|
||||||
|
std::string decoded_qr = qrDecoder.detectAndDecode(image);
|
||||||
|
return find_check_parameters(decoded_qr);
|
||||||
|
}
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string EmailParser::search_in_text(std::string &content) {
|
||||||
|
boost::regex text_content_types_regex("Content-Type: text\\/(html|plain)");
|
||||||
|
boost::regex part_end_regex("--.{5,48}");
|
||||||
|
boost::regex to_erase("([\\w-]+:\\s*.{2,64}\\r\\n)+");
|
||||||
|
boost::regex to_erase_two("--.{5,48}");
|
||||||
|
/* If the E-Mail has no QR code in it as a separate part, there's posibilly a QR code inserted using html's tag <img> with base64-encoded image. Try searching it */
|
||||||
|
|
||||||
|
std::vector<std::pair<int, int>> texts_content_parts = find_parts(text_content_types_regex, part_end_regex, content);
|
||||||
|
|
||||||
|
for (unsigned int i = 0; i < texts_content_parts.size(); i ++) {
|
||||||
|
|
||||||
|
std::string part = content.substr(texts_content_parts[i].first, texts_content_parts[i].second);
|
||||||
|
boost::erase_regex(part, to_erase);
|
||||||
|
boost::erase_regex(part, to_erase_two);
|
||||||
|
|
||||||
|
|
||||||
|
//If there's '<' character, most likely that the part's content is plain html, otherwise it's most likely a base64 encoded html.
|
||||||
|
if (part.find("<") == std::string::npos) {
|
||||||
|
part.erase(std::remove(part.begin(), part.end(), '\r'), part.end());
|
||||||
|
part.erase(std::remove(part.begin(), part.end(), '\n'), part.end());
|
||||||
|
part = base64_decode(part);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try searching parameters just in plain html. Will help if there's a link to a QR code with it's parameters passed in request.
|
||||||
|
std::string parameters = find_check_parameters(part);
|
||||||
|
|
||||||
|
if (parameters != "") return parameters;
|
||||||
|
|
||||||
|
// If there's no, try search anything that looks like a link to a qr code.
|
||||||
|
std::string url = extract_qr_url_from_img(part);
|
||||||
|
Net n;
|
||||||
|
std::string path = get_path_relative_to_home(".local/share/checks_parser/tmp");
|
||||||
|
n.get_file(url, path);
|
||||||
|
|
||||||
|
std::string qr_code_contents = read_file(path);
|
||||||
|
|
||||||
|
std::vector<uchar> data(qr_code_contents.begin(), qr_code_contents.end());
|
||||||
|
cv::Mat image = cv::imdecode(cv::Mat(data), 1);
|
||||||
|
|
||||||
|
cv::QRCodeDetector qrDecoder = cv::QRCodeDetector();
|
||||||
|
std::string decoded_qr = qrDecoder.detectAndDecode(image);
|
||||||
|
parameters = find_check_parameters(decoded_qr);
|
||||||
|
|
||||||
|
if (parameters != "") return parameters;
|
||||||
|
|
||||||
|
// if there's no any link that looks like a link to QR code, maybe the qr code is encoded as base64 inside an img tag.
|
||||||
|
}
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
|
|
@ -13,6 +13,9 @@ public:
|
||||||
std::vector<std::pair<int, int>> find_parts(const boost::regex &start_regex, const boost::regex &end_regex, const std::string &content);
|
std::vector<std::pair<int, int>> find_parts(const boost::regex &start_regex, const boost::regex &end_regex, const std::string &content);
|
||||||
std::string find_check_parameters(std::string &part);
|
std::string find_check_parameters(std::string &part);
|
||||||
std::string extract_qr_url_from_img(std::string &part);
|
std::string extract_qr_url_from_img(std::string &part);
|
||||||
|
|
||||||
|
std::string search_in_images(std::string &content);
|
||||||
|
std::string search_in_text(std::string &content);
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif // CHECKS_PARSER_EMAIL_PARSER
|
#endif // CHECKS_PARSER_EMAIL_PARSER
|
||||||
|
|
16
main.cpp
16
main.cpp
|
@ -33,14 +33,14 @@
|
||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
|
|
||||||
// EmailParser p;
|
EmailParser p;
|
||||||
// p.parse_file("/home/leca/example_email_receipts/читай_город.eml");
|
p.parse_file("/home/leca/example_email_receipts/читай_город.eml");
|
||||||
// p.parse_file("/home/leca/example_email_receipts/lenta.eml");
|
p.parse_file("/home/leca/example_email_receipts/lenta.eml");
|
||||||
// p.parse_file("/home/leca/example_email_receipts/magnit.eml");
|
p.parse_file("/home/leca/example_email_receipts/magnit.eml");
|
||||||
// p.parse_file("/home/leca/example_email_receipts/pyaterochka.eml");
|
p.parse_file("/home/leca/example_email_receipts/pyaterochka.eml");
|
||||||
// p.parse_file("/home/leca/example_email_receipts/rzd.eml");
|
p.parse_file("/home/leca/example_email_receipts/rzd.eml");
|
||||||
// p.parse_file("/home/leca/example_email_receipts/russteels.eml");
|
p.parse_file("/home/leca/example_email_receipts/russteels.eml");
|
||||||
// return 0;
|
return 0;
|
||||||
curl_global_init(CURL_GLOBAL_ALL);
|
curl_global_init(CURL_GLOBAL_ALL);
|
||||||
qRegisterMetaType<Check>("Check");
|
qRegisterMetaType<Check>("Check");
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue