improvements on emailparser
This commit is contained in:
parent
4c262de8a4
commit
94acf816ea
|
@ -11,6 +11,7 @@
|
|||
#include <check/check.h>
|
||||
#include <boost/regex.hpp>
|
||||
#include <boost/algorithm/string/regex.hpp>
|
||||
#include <boost/algorithm/string.hpp>
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
|
||||
|
@ -24,34 +25,31 @@
|
|||
|
||||
|
||||
/**
 * Builds the regular expressions that the parser keeps as members, so they
 * are compiled once per EmailParser instead of once per call.
 *
 *  - headings_regex:   one or more consecutive "Name: value\r\n" lines, where
 *                      the value is 2 to 64 characters long.
 *  - end_marker_regex: "--" followed by 5 to 57 characters that are not a
 *                      line break, '<' or '>'.
 *  - part_end_regex:   same as end_marker_regex, but a space is excluded too.
 */
EmailParser::EmailParser()
    : headings_regex("([\\w-]+:\\s*.{2,64}\\r\\n)+"),
      end_marker_regex("--[^\\n\\r<>]{5,57}"),
      part_end_regex("--[^\\n\\r<> ]{5,57}") {
}
|
||||
|
||||
std::map<std::string, std::string> EmailParser::parse(std::string &email_content) {
|
||||
// boost::regex check_data_content("t=\\d+T\\d+&s=\\d+\\.\\d+&fn=\\d{16}&i=\\d{4,5}&fp=\\d{10}&n=\\d");
|
||||
boost::regex to_erase("([\\w-]+:\\s*.{2,64}\\r\\n)+");
|
||||
boost::regex to_erase_two("--.{5,48}");
|
||||
|
||||
std::string parameters;
|
||||
parameters = search_in_images(email_content);
|
||||
|
||||
if (parameters != "") {
|
||||
std::cout << parameters << std::endl;
|
||||
return get_params_from_string(parameters);
|
||||
}
|
||||
|
||||
parameters = search_in_text(email_content);
|
||||
if (parameters != "") {
|
||||
std::cout << parameters << std::endl;
|
||||
return get_params_from_string(parameters);
|
||||
}
|
||||
|
||||
std::cout << "Failed to parse" << std::endl;
|
||||
/* If the code has reached this part and found nothing, it's most likely that there are no QR codes at all. */
|
||||
|
||||
return std::map<std::string, std::string>();
|
||||
}
|
||||
|
||||
std::map<std::string, std::string> EmailParser::parse_file(std::string path) {
|
||||
std::cout << "Parsing file " << path << std::endl;
|
||||
std::string content = read_file(path);
|
||||
return parse(content);
|
||||
return std::map<std::string, std::string>();
|
||||
|
@ -73,48 +71,51 @@ std::vector<std::pair<int, int>> EmailParser::find_parts(const boost::regex &sta
|
|||
}
|
||||
|
||||
std::string EmailParser::find_check_parameters(std::string &part) {
|
||||
boost::regex params_regex ("t(=|%3d)\\d+T\\d+(&|%26)s\\1\\d+\\.\\d+\\2fn\\1\\d{16}\\2i\\1\\d{3,6}\\2fp\\1\\d{9,10}\\2n\\1\\d", boost::regex::icase);
|
||||
for (boost::sregex_iterator it{part.begin(), part.end(), params_regex}, end{}; it != end; it++) {
|
||||
return it->str();
|
||||
}
|
||||
boost::regex params_regex ("t(=|(%|=)3d)\\d+T\\d+(&(amp;)?|%26)s\\1\\d+\\.\\d+\\3fn\\1\\d{16}\\3i\\1\\d{3,6}\\3fp\\1\\d{9,10}\\3n\\1\\d", boost::regex::icase);
|
||||
boost::smatch matched;
|
||||
if (boost::regex_search(part, matched, params_regex))
|
||||
return matched[0].str();
|
||||
|
||||
return "";
|
||||
}
|
||||
|
||||
std::string EmailParser::extract_qr_url_from_img(std::string &part) {
|
||||
boost::regex img_tag_regex("<img[^\\n\\r<]*>");
|
||||
boost::regex img_url_str("https?:\\/\\/.*(qr(code)?)[^\\n\\r\"]+", boost::regex::icase);
|
||||
boost::regex img_url_regex("(?<=<img src=\")https:\\/\\/[^\\n\\r\"]*\\/qr(code)?[^\\n\\r\"]*", boost::regex::icase);
|
||||
boost::smatch matched;
|
||||
if (boost::regex_search(part, matched, img_url_regex))
|
||||
return matched[0].str();
|
||||
|
||||
for (boost::sregex_iterator it{part.begin(), part.end(), img_tag_regex}, end{}; it != end; it++) {
|
||||
std::string img_tag = it->str();
|
||||
for (boost::sregex_iterator it2{img_tag.begin(), img_tag.end(), img_url_str}, end2{}; it2 != end2; it2++) {
|
||||
return it2->str();
|
||||
}
|
||||
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
std::string EmailParser::extract_content_transfer_encoding(std::string &part) {
|
||||
boost::regex content_transfer_encoding_regex("(?<=content-transfer-encoding: ).{0,20}(?=\\r\\n)", boost::regex::icase);
|
||||
boost::smatch matched;
|
||||
std::string transfer_encoding = "";
|
||||
|
||||
if (boost::regex_search(part, matched, content_transfer_encoding_regex))
|
||||
transfer_encoding = matched[0].str();
|
||||
|
||||
if (transfer_encoding == "") return "";
|
||||
std::transform(transfer_encoding.begin(), transfer_encoding.end(), transfer_encoding.begin(), ::tolower);
|
||||
boost::trim(transfer_encoding);
|
||||
return transfer_encoding;
|
||||
}
|
||||
|
||||
std::vector<std::string> EmailParser::extract_qr_embeddings_from_part(std::string &part) {
|
||||
std::vector<std::string> embeddings = {};
|
||||
boost::regex img_tag_regex("<img[^\\n\\r<]*>");
|
||||
boost::regex img_base64_str("data:image\\/(png|jpg);base64,[\\w+\\/=]+", boost::regex::icase);
|
||||
|
||||
for (boost::sregex_iterator it{part.begin(), part.end(), img_tag_regex}, end{}; it != end; it++) {
|
||||
std::string img_tag = it->str();
|
||||
for (boost::sregex_iterator it2{img_tag.begin(), img_tag.end(), img_base64_str}, end2{}; it2 != end2; it2++) {
|
||||
embeddings.push_back(split(it2->str(), ",")[1]);
|
||||
boost::regex img_base64_regex("(?<=<img src=\"data:image\\/(png|jpg);base64,)[^\n\r\"]*", boost::regex::icase);
|
||||
boost::smatch matched;
|
||||
if (boost::regex_search(part, matched, img_base64_regex)) {
|
||||
for (unsigned int i = 0; i < matched.size(); i ++) {
|
||||
embeddings.push_back(matched[i].str());
|
||||
}
|
||||
|
||||
}
|
||||
return embeddings;
|
||||
}
|
||||
|
||||
std::string EmailParser::search_in_images(std::string &content) {
|
||||
boost::regex images_content_type_regex("Content-Type: image/(gif|png|jpg)");
|
||||
boost::regex part_end_regex("--.{5,48}");
|
||||
boost::regex to_erase("([\\w-]+:\\s*.{2,64}\\r\\n)+");
|
||||
boost::regex to_erase_two("--.{5,48}");
|
||||
|
||||
boost::regex images_content_type_regex("Content-Type: image/(gif|png|jpg)", boost::regex::icase);
|
||||
std::vector<std::pair<int, int>> images_content_parts = find_parts(images_content_type_regex, part_end_regex, content);
|
||||
|
||||
/* iterate through found image content-types and try searching qr codes, decode them and see if it's the needed data */
|
||||
|
@ -122,10 +123,9 @@ std::string EmailParser::search_in_images(std::string &content) {
|
|||
for (unsigned int i = 0; i < images_content_parts.size(); i ++) {
|
||||
|
||||
std::string part = content.substr(images_content_parts[i].first, images_content_parts[i].second);
|
||||
boost::erase_regex(part, to_erase);
|
||||
boost::erase_regex(part, to_erase_two);
|
||||
part.erase(std::remove(part.begin(), part.end(), '\r'), part.end());
|
||||
part.erase(std::remove(part.begin(), part.end(), '\n'), part.end());
|
||||
boost::erase_regex(part, headings_regex);
|
||||
boost::erase_regex(part, end_marker_regex);
|
||||
boost::erase_all_regex(part, boost::regex("\\r\\n"));
|
||||
std::string decoded = base64_decode(part);
|
||||
return handle_image(decoded);
|
||||
}
|
||||
|
@ -133,10 +133,7 @@ std::string EmailParser::search_in_images(std::string &content) {
|
|||
}
|
||||
|
||||
std::string EmailParser::search_in_text(std::string &content) {
|
||||
boost::regex text_content_types_regex("Content-Type: text\\/(html|plain)");
|
||||
boost::regex part_end_regex("--.{5,48}");
|
||||
boost::regex to_erase("([\\w-]+:\\s*.{2,64}\\r\\n)+");
|
||||
boost::regex to_erase_two("--.{5,48}");
|
||||
boost::regex text_content_types_regex("Content-Type: text\\/(html|plain)", boost::regex::icase);
|
||||
/* If the e-mail has no QR code as a separate part, there is possibly a QR code embedded via an HTML <img> tag with a base64-encoded image. Try searching for it. */
|
||||
|
||||
std::vector<std::pair<int, int>> texts_content_parts = find_parts(text_content_types_regex, part_end_regex, content);
|
||||
|
@ -144,14 +141,15 @@ std::string EmailParser::search_in_text(std::string &content) {
|
|||
for (unsigned int i = 0; i < texts_content_parts.size(); i ++) {
|
||||
|
||||
std::string part = content.substr(texts_content_parts[i].first, texts_content_parts[i].second);
|
||||
boost::erase_regex(part, to_erase);
|
||||
boost::erase_regex(part, to_erase_two);
|
||||
std::string transfer_encoding = extract_content_transfer_encoding(part);
|
||||
|
||||
boost::erase_regex(part, headings_regex);
|
||||
boost::erase_regex(part, end_marker_regex);
|
||||
|
||||
// If the part contains a '<' character, its content is most likely plain HTML; otherwise it is most likely base64-encoded HTML.
|
||||
if (part.find("<") == std::string::npos) {
|
||||
part.erase(std::remove(part.begin(), part.end(), '\r'), part.end());
|
||||
part.erase(std::remove(part.begin(), part.end(), '\n'), part.end());
|
||||
if (transfer_encoding == "quoted-printable") {
|
||||
boost::erase_all_regex(part, boost::regex("=\\r\\n"));
|
||||
} else if (transfer_encoding == "base64") {
|
||||
boost::erase_all_regex(part, boost::regex("\\r\\n"));
|
||||
part = base64_decode(part);
|
||||
}
|
||||
|
||||
|
@ -168,13 +166,7 @@ std::string EmailParser::search_in_text(std::string &content) {
|
|||
n.get_file(url, path);
|
||||
|
||||
std::string qr_code_contents = read_file(path);
|
||||
|
||||
std::vector<uchar> data(qr_code_contents.begin(), qr_code_contents.end());
|
||||
cv::Mat image = cv::imdecode(cv::Mat(data), 1);
|
||||
|
||||
cv::QRCodeDetector qrDecoder = cv::QRCodeDetector();
|
||||
std::string decoded_qr = qrDecoder.detectAndDecode(image);
|
||||
parameters = find_check_parameters(decoded_qr);
|
||||
parameters = handle_image(qr_code_contents);
|
||||
}
|
||||
if (parameters != "") return parameters;
|
||||
|
||||
|
@ -193,7 +185,7 @@ std::string EmailParser::search_in_text(std::string &content) {
|
|||
std::string EmailParser::handle_image(std::string &content) {
|
||||
cv::Mat image;
|
||||
|
||||
if (content.substr(1, 3) == "PNG" || content.substr(1, 3) == "JPG") {
|
||||
if (content.substr(1, 3) == "PNG" || content.substr(1, 3) == "JPG" || content.substr(6, 4) == "JFIF") {
|
||||
std::vector<uchar> data(content.begin(), content.end());
|
||||
image = cv::imdecode(cv::Mat(data), 1);
|
||||
} else if (content.substr(0, 3) == "GIF") {
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
#include <boost/regex.hpp>
|
||||
|
||||
class EmailParser {
|
||||
boost::regex headings_regex, end_marker_regex, part_end_regex;
|
||||
public:
|
||||
EmailParser();
|
||||
std::map<std::string, std::string> parse(std::string &email_content);
|
||||
|
@ -13,6 +14,7 @@ public:
|
|||
std::vector<std::pair<int, int>> find_parts(const boost::regex &start_regex, const boost::regex &end_regex, const std::string &content);
|
||||
std::string find_check_parameters(std::string &part);
|
||||
std::string extract_qr_url_from_img(std::string &part);
|
||||
std::string extract_content_transfer_encoding(std::string &part);
|
||||
std::vector<std::string> extract_qr_embeddings_from_part(std::string &part);
|
||||
|
||||
std::string search_in_images(std::string &content);
|
||||
|
|
3
main.cpp
3
main.cpp
|
@ -34,8 +34,7 @@
|
|||
int main(int argc, char *argv[]) {
|
||||
|
||||
// EmailParser p;
|
||||
// p.parse_file("/home/leca/example_email_receipts/lamoda.eml");
|
||||
// // p.parse_file("/home/leca/example_email_receipts/lamoda2.eml");
|
||||
// p.parse_file("/home/leca/example_email_receipts/avito.eml");
|
||||
// p.parse_file("/home/leca/example_email_receipts/читай_город.eml");
|
||||
// p.parse_file("/home/leca/example_email_receipts/lenta.eml");
|
||||
// p.parse_file("/home/leca/example_email_receipts/magnit.eml");
|
||||
|
|
|
@ -671,7 +671,7 @@
|
|||
<context>
|
||||
<name>QObject</name>
|
||||
<message>
|
||||
<location filename="../main.cpp" line="77"/>
|
||||
<location filename="../main.cpp" line="76"/>
|
||||
<source>Using locale: </source>
|
||||
<translation>Using locale: </translation>
|
||||
</message>
|
||||
|
|
|
@ -647,7 +647,7 @@
|
|||
<context>
|
||||
<name>QObject</name>
|
||||
<message>
|
||||
<location filename="../main.cpp" line="77"/>
|
||||
<location filename="../main.cpp" line="76"/>
|
||||
<source>Using locale: </source>
|
||||
<translation>Использую локаль: </translation>
|
||||
</message>
|
||||
|
|
Loading…
Reference in New Issue