more advanced parsing
This commit is contained in:
parent
9da589839c
commit
c9f447009b
|
@ -1,5 +1,3 @@
|
||||||
#include "utils/utils.h"
|
|
||||||
#include "utils/base64.h"
|
|
||||||
#include <email_parser/emailparser.h>
|
#include <email_parser/emailparser.h>
|
||||||
|
|
||||||
#include <opencv2/opencv.hpp>
|
#include <opencv2/opencv.hpp>
|
||||||
|
@ -9,6 +7,7 @@
|
||||||
#include <opencv2/videoio.hpp>
|
#include <opencv2/videoio.hpp>
|
||||||
|
|
||||||
#include <utils/utils.h>
|
#include <utils/utils.h>
|
||||||
|
#include <utils/base64.h>
|
||||||
#include <check/check.h>
|
#include <check/check.h>
|
||||||
#include <boost/regex.hpp>
|
#include <boost/regex.hpp>
|
||||||
#include <boost/algorithm/string/regex.hpp>
|
#include <boost/algorithm/string/regex.hpp>
|
||||||
|
@ -37,28 +36,21 @@ std::map<std::string, std::string> EmailParser::parse(std::string &email_content
|
||||||
//3. Search "t=\d{8}T\d{4,6}&s=\d{1,6}\.\d{1,2}&fn=\d{10,16}&i=\d{6}&fp=\d{10}&n=\d". Note that in some emails the = and & signs may be replaced by their percent-encoded forms: %3D and %26
|
//3. Search "t=\d{8}T\d{4,6}&s=\d{1,6}\.\d{1,2}&fn=\d{10,16}&i=\d{6}&fp=\d{10}&n=\d". Note that in some emails the = and & signs may be replaced by their percent-encoded forms: %3D and %26
|
||||||
// 3.1 If not found, notify the user that we could not parse the .eml file
|
// 3.1 If not found, notify the user that we could not parse the .eml file
|
||||||
|
|
||||||
/* Find content-types */
|
/* Find image content-types */
|
||||||
Check c;
|
boost::regex images_content_type_regex("Content-Type: image/(gif|png|jpg)");
|
||||||
std::vector<std::pair<int, int>> content_types = {};
|
boost::regex text_content_types_regex("Content-Type: text\\/(html|plain)");
|
||||||
boost::regex content_type_regex("Content-Type: image/(gif|png|jpg)");
|
|
||||||
boost::regex part_end_regex("--.{5,48}");
|
boost::regex part_end_regex("--.{5,48}");
|
||||||
|
// boost::regex check_data_content("t=\\d+T\\d+&s=\\d+\\.\\d+&fn=\\d{16}&i=\\d{4,5}&fp=\\d{10}&n=\\d");
|
||||||
for (boost::sregex_iterator it{email_content.begin(), email_content.end(), content_type_regex}, end{}; it != end; it ++) {
|
boost::regex to_erase("([\\w-]+:\\s*.{2,64}\\r\\n)+");
|
||||||
unsigned int start_position = it->position(), end_position = -1;
|
|
||||||
|
|
||||||
for (boost::sregex_iterator it2{email_content.begin() + start_position, email_content.end(), part_end_regex}, end2{}; it2 != end2; it2++) {
|
|
||||||
end_position = it2->position();
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
content_types.push_back(std::pair<int, int>(start_position, end_position));
|
|
||||||
}
|
|
||||||
|
|
||||||
/* iterate through found content-types and try searching qr codes, decode them and see if it's the needed data */
|
|
||||||
|
|
||||||
for (unsigned int i = 0; i < content_types.size(); i ++) {
|
|
||||||
boost::regex to_erase("(Content.{5,64}\\r\\n)+");
|
|
||||||
boost::regex to_erase_two("--.{5,48}");
|
boost::regex to_erase_two("--.{5,48}");
|
||||||
std::string part = email_content.substr(content_types[i].first, content_types[i].second);
|
|
||||||
|
std::vector<std::pair<int, int>> images_content_parts = find_parts(images_content_type_regex, part_end_regex, email_content);
|
||||||
|
|
||||||
|
/* iterate through found image content-types and try searching qr codes, decode them and see if it's the needed data */
|
||||||
|
|
||||||
|
for (unsigned int i = 0; i < images_content_parts.size(); i ++) {
|
||||||
|
|
||||||
|
std::string part = email_content.substr(images_content_parts[i].first, images_content_parts[i].second);
|
||||||
boost::erase_regex(part, to_erase);
|
boost::erase_regex(part, to_erase);
|
||||||
boost::erase_regex(part, to_erase_two);
|
boost::erase_regex(part, to_erase_two);
|
||||||
part.erase(std::remove(part.begin(), part.end(), '\r'), part.end());
|
part.erase(std::remove(part.begin(), part.end(), '\r'), part.end());
|
||||||
|
@ -80,10 +72,11 @@ std::map<std::string, std::string> EmailParser::parse(std::string &email_content
|
||||||
}
|
}
|
||||||
|
|
||||||
cv::QRCodeDetector qrDecoder = cv::QRCodeDetector();
|
cv::QRCodeDetector qrDecoder = cv::QRCodeDetector();
|
||||||
std::string decoded_qr_params = qrDecoder.detectAndDecode(image);
|
std::string decoded_qr = qrDecoder.detectAndDecode(image);
|
||||||
boost::regex check_data_content("t=\\d+T\\d+&s=\\d+\\.\\d+&fn=\\d{16}&i=\\d{4,5}&fp=\\d{10}&n=\\d");
|
std::string parameters = find_check_parameters(decoded_qr);
|
||||||
if (boost::regex_match(decoded_qr_params, check_data_content)) {
|
|
||||||
std::map<std::string, std::string> paramsMap = get_params_from_string(decoded_qr_params);
|
if (parameters != "") {
|
||||||
|
std::map<std::string, std::string> paramsMap = get_params_from_string(parameters);
|
||||||
|
|
||||||
return paramsMap;
|
return paramsMap;
|
||||||
}
|
}
|
||||||
|
@ -91,17 +84,105 @@ std::map<std::string, std::string> EmailParser::parse(std::string &email_content
|
||||||
|
|
||||||
/* If the e-mail has no QR code attached as a separate part, there is possibly a QR code embedded through an HTML <img> tag with a base64-encoded image. Try searching for it */
|
/* If the e-mail has no QR code attached as a separate part, there is possibly a QR code embedded through an HTML <img> tag with a base64-encoded image. Try searching for it */
|
||||||
|
|
||||||
|
std::vector<std::pair<int, int>> texts_content_parts = find_parts(text_content_types_regex, part_end_regex, email_content);
|
||||||
|
|
||||||
|
for (unsigned int i = 0; i < texts_content_parts.size(); i ++) {
|
||||||
|
|
||||||
|
std::string part = email_content.substr(texts_content_parts[i].first, texts_content_parts[i].second);
|
||||||
|
boost::erase_regex(part, to_erase);
|
||||||
|
boost::erase_regex(part, to_erase_two);
|
||||||
|
|
||||||
|
|
||||||
|
// If the part contains a '<' character, its content is most likely plain HTML; otherwise it is most likely base64-encoded HTML.
|
||||||
|
if (part.find("<") == std::string::npos) {
|
||||||
|
part.erase(std::remove(part.begin(), part.end(), '\r'), part.end());
|
||||||
|
part.erase(std::remove(part.begin(), part.end(), '\n'), part.end());
|
||||||
|
part = base64_decode(part);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string parameters = find_check_parameters(part);
|
||||||
|
|
||||||
|
if (parameters != "") {
|
||||||
|
std::map<std::string, std::string> paramsMap = get_params_from_string(parameters);
|
||||||
|
|
||||||
|
return paramsMap;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string url = extract_qr_url_from_img(part);
|
||||||
|
Net n;
|
||||||
|
std::string path = get_path_relative_to_home(".local/share/checks_parser/tmp");
|
||||||
|
n.get_file(url, path);
|
||||||
|
|
||||||
|
std::ifstream ifile(path, std::ios::in | std::ios::binary);
|
||||||
|
const unsigned int size = std::filesystem::file_size(path);
|
||||||
|
std::string qr_code_contents(size, '\0');
|
||||||
|
ifile.read(qr_code_contents.data(), size);
|
||||||
|
|
||||||
|
std::vector<uchar> data(qr_code_contents.begin(), qr_code_contents.end());
|
||||||
|
cv::Mat image = cv::imdecode(cv::Mat(data), 1);
|
||||||
|
|
||||||
|
cv::QRCodeDetector qrDecoder = cv::QRCodeDetector();
|
||||||
|
std::string decoded_qr = qrDecoder.detectAndDecode(image);
|
||||||
|
parameters = find_check_parameters(decoded_qr);
|
||||||
|
|
||||||
|
if (parameters != "") {
|
||||||
|
std::map<std::string, std::string> paramsMap = get_params_from_string(parameters);
|
||||||
|
|
||||||
|
return paramsMap;
|
||||||
|
}
|
||||||
|
}
|
||||||
/* If there's no such case, the last chance is <img src="..."> which will have a link with needed parameters or the qr code that should be downloaded and decoded */
|
/* If there's no such case, the last chance is <img src="..."> which will have a link with needed parameters or the qr code that should be downloaded and decoded */
|
||||||
|
|
||||||
/* If the code has reached this part and found nothing, it's most likely that there are no QR codes at all. */
|
/* If the code has reached this part and found nothing, it's most likely that there are no QR codes at all. */
|
||||||
// return Check();
|
|
||||||
|
return std::map<std::string, std::string>();
|
||||||
}
|
}
|
||||||
|
|
||||||
std::map<std::string, std::string> EmailParser::parse_file(std::string path) {
|
std::map<std::string, std::string> EmailParser::parse_file(std::string path) {
|
||||||
std::ifstream ifile(path, std::ios::in | std::ios::binary);
|
// std::cout << "Parsing " << path << std::endl;
|
||||||
const unsigned int size = std::filesystem::file_size(path);
|
// std::ifstream ifile(path, std::ios::in | std::ios::binary);
|
||||||
std::string content(size, '\0');
|
// const unsigned int size = std::filesystem::file_size(path);
|
||||||
ifile.read(content.data(), size);
|
// std::string content(size, '\0');
|
||||||
|
// ifile.read(content.data(), size);
|
||||||
|
std::string content = read_file(path);
|
||||||
return parse(content);
|
return parse(content);
|
||||||
return std::map<std::string, std::string>();
|
return std::map<std::string, std::string>();
|
||||||
}
|
}
|
||||||
|
/**
 * Collects every region of `content` that starts at a match of `start_regex`.
 *
 * For each start match, the first match of `end_regex` located at or after
 * that start is looked up.
 *
 * @param start_regex pattern marking the beginning of a part.
 * @param end_regex   pattern marking the end of a part.
 * @param content     text to scan.
 * @return one pair per start match:
 *         - first:  offset of the start match from the beginning of `content`;
 *         - second: offset of the end match measured from `first` (the search
 *                   begins at `first`, so this is effectively the part's
 *                   length), or `content.length()` when no end match exists.
 */
std::vector<std::pair<int, int>> EmailParser::find_parts(const boost::regex &start_regex, const boost::regex &end_regex, const std::string &content) {
    std::vector<std::pair<int, int>> found;

    const boost::sregex_iterator no_more_headers;
    boost::sregex_iterator header(content.begin(), content.end(), start_regex);

    while (header != no_more_headers) {
        unsigned int offset = header->position();
        // Fallback used when no terminator follows this header.
        unsigned int extent = content.length();

        // Only the first terminator after the header matters.
        boost::smatch terminator;
        if (boost::regex_search(content.begin() + offset, content.end(), terminator, end_regex)) {
            extent = terminator.position();
        }

        found.push_back(std::pair<int, int>(offset, extent));
        ++header;
    }

    return found;
}
|
||||||
|
|
||||||
|
std::string EmailParser::find_check_parameters(std::string &part) {
|
||||||
|
boost::regex params_regex ("t(=|%3d)\\d+T\\d+(&|%26)s\\1\\d+\\.\\d+\\2fn\\1\\d{16}\\2i\\1\\d{3,6}\\2fp\\1\\d{9,10}\\2n\\1\\d", boost::regex::icase);
|
||||||
|
for (boost::sregex_iterator it{part.begin(), part.end(), params_regex}, end{}; it != end; it++) {
|
||||||
|
return it->str();
|
||||||
|
}
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string EmailParser::extract_qr_url_from_img(std::string &part) {
|
||||||
|
std::string url = "";
|
||||||
|
boost::regex img_tag_regex("<img[^\\n\\r<]*>");
|
||||||
|
boost::regex img_url_str("https?:\\/\\/.*(qr(code)?)[^\\n\\r\"]+", boost::regex::icase);
|
||||||
|
|
||||||
|
for (boost::sregex_iterator it{part.begin(), part.end(), img_tag_regex}, end{}; it != end; it++) {
|
||||||
|
std::string img_tag = it->str();
|
||||||
|
for (boost::sregex_iterator it2{img_tag.begin(), img_tag.end(), img_url_str}, end2{}; it2 != end2; it2++) {
|
||||||
|
return it2->str();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
return url;
|
||||||
|
}
|
||||||
|
|
|
@ -3,13 +3,16 @@
|
||||||
|
|
||||||
#include <check/check.h>
|
#include <check/check.h>
|
||||||
#include <map>
|
#include <map>
|
||||||
|
#include <boost/regex.hpp>
|
||||||
|
|
||||||
class EmailParser {
|
class EmailParser {
|
||||||
public:
|
public:
|
||||||
EmailParser();
|
EmailParser();
|
||||||
std::map<std::string, std::string> parse(std::string &email_content);
|
std::map<std::string, std::string> parse(std::string &email_content);
|
||||||
std::map<std::string, std::string> parse_file(std::string path);
|
std::map<std::string, std::string> parse_file(std::string path);
|
||||||
|
std::vector<std::pair<int, int>> find_parts(const boost::regex &start_regex, const boost::regex &end_regex, const std::string &content);
|
||||||
|
std::string find_check_parameters(std::string &part);
|
||||||
|
std::string extract_qr_url_from_img(std::string &part);
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif // CHECKS_PARSER_EMAIL_PARSER
|
#endif // CHECKS_PARSER_EMAIL_PARSER
|
||||||
|
|
3
main.cpp
3
main.cpp
|
@ -34,9 +34,12 @@
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
|
|
||||||
// EmailParser p;
|
// EmailParser p;
|
||||||
|
// p.parse_file("/home/leca/example_email_receipts/читай_город.eml");
|
||||||
|
// p.parse_file("/home/leca/example_email_receipts/lenta.eml");
|
||||||
// p.parse_file("/home/leca/example_email_receipts/magnit.eml");
|
// p.parse_file("/home/leca/example_email_receipts/magnit.eml");
|
||||||
// p.parse_file("/home/leca/example_email_receipts/pyaterochka.eml");
|
// p.parse_file("/home/leca/example_email_receipts/pyaterochka.eml");
|
||||||
// p.parse_file("/home/leca/example_email_receipts/rzd.eml");
|
// p.parse_file("/home/leca/example_email_receipts/rzd.eml");
|
||||||
|
// p.parse_file("/home/leca/example_email_receipts/russteels.eml");
|
||||||
// return 0;
|
// return 0;
|
||||||
curl_global_init(CURL_GLOBAL_ALL);
|
curl_global_init(CURL_GLOBAL_ALL);
|
||||||
qRegisterMetaType<Check>("Check");
|
qRegisterMetaType<Check>("Check");
|
||||||
|
|
25
net/net.cpp
25
net/net.cpp
|
@ -11,25 +11,38 @@ size_t writeCallback(void* contents, size_t size, size_t nmemb, void* userp) {
|
||||||
return totalSize;
|
return totalSize;
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t write_data(void *buffer, size_t size, size_t nmemb, void *filename) {
|
// size_t write_data_to_file(void *buffer, size_t size, size_t nmemb, void *filename) {
|
||||||
FILE *f = fopen(((std::string *)filename)->c_str(), "w");
|
// FILE *f = fopen(((std::string *)filename)->c_str(), "wb");
|
||||||
size_t written = fwrite(buffer, size, nmemb, f);
|
// size_t written = fwrite(buffer, size, nmemb, f);
|
||||||
|
|
||||||
fclose(f);
|
// fclose(f);
|
||||||
|
|
||||||
|
// return written;
|
||||||
|
// }
|
||||||
|
|
||||||
|
size_t write_data(void *ptr, size_t size, size_t nmemb, FILE *stream) {
|
||||||
|
size_t written;
|
||||||
|
written = fwrite(ptr, size, nmemb, stream);
|
||||||
return written;
|
return written;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// size_t write_data(void *buffer, size_t size, size_t nmemb, void *string_buffer) {
|
||||||
|
// *(std::string *)string_buffer = std::string((char *)buffer);
|
||||||
|
// std::cout << (char*)buffer << std::endl;
|
||||||
|
// return size;
|
||||||
|
// }
|
||||||
|
|
||||||
void Net::get_file(std::string url, std::string filename) {
|
void Net::get_file(std::string url, std::string filename) {
|
||||||
CURL *handle = curl_easy_init();
|
CURL *handle = curl_easy_init();
|
||||||
|
|
||||||
curl_easy_setopt(handle, CURLOPT_URL, url.c_str());
|
curl_easy_setopt(handle, CURLOPT_URL, url.c_str());
|
||||||
|
FILE *f = fopen(filename.c_str(), "wb");
|
||||||
|
|
||||||
curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, write_data);
|
curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, write_data);
|
||||||
curl_easy_setopt(handle, CURLOPT_WRITEDATA, &filename);
|
curl_easy_setopt(handle, CURLOPT_WRITEDATA, f);
|
||||||
|
|
||||||
auto success = curl_easy_perform(handle);
|
auto success = curl_easy_perform(handle);
|
||||||
|
fclose(f);
|
||||||
curl_easy_cleanup(handle);
|
curl_easy_cleanup(handle);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -4,7 +4,7 @@
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
size_t write_data(void *buffer, size_t size, size_t nmemb, void *userp);
|
size_t write_data_to_file(void *buffer, size_t size, size_t nmemb, void *userp);
|
||||||
size_t writeCallback(void* contents, size_t size, size_t nmemb, void* userp);
|
size_t writeCallback(void* contents, size_t size, size_t nmemb, void* userp);
|
||||||
|
|
||||||
class Net
|
class Net
|
||||||
|
@ -14,6 +14,7 @@ public:
|
||||||
void get_file(std::string url, std::string filename);
|
void get_file(std::string url, std::string filename);
|
||||||
std::string fetch_check_data_from_ofdru(std::string fn, std::string fd, std::string fi, std::string datetime, int operation, int total, std::string captcha);
|
std::string fetch_check_data_from_ofdru(std::string fn, std::string fd, std::string fi, std::string datetime, int operation, int total, std::string captcha);
|
||||||
void get_captcha_from_ofdru();
|
void get_captcha_from_ofdru();
|
||||||
|
std::string get_data(std::string url);
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif // NET_H
|
#endif // NET_H
|
||||||
|
|
|
@ -661,7 +661,7 @@
|
||||||
<context>
|
<context>
|
||||||
<name>QObject</name>
|
<name>QObject</name>
|
||||||
<message>
|
<message>
|
||||||
<location filename="../main.cpp" line="71"/>
|
<location filename="../main.cpp" line="74"/>
|
||||||
<source>Using locale: </source>
|
<source>Using locale: </source>
|
||||||
<translation>Using locale: </translation>
|
<translation>Using locale: </translation>
|
||||||
</message>
|
</message>
|
||||||
|
|
|
@ -637,7 +637,7 @@
|
||||||
<context>
|
<context>
|
||||||
<name>QObject</name>
|
<name>QObject</name>
|
||||||
<message>
|
<message>
|
||||||
<location filename="../main.cpp" line="71"/>
|
<location filename="../main.cpp" line="74"/>
|
||||||
<source>Using locale: </source>
|
<source>Using locale: </source>
|
||||||
<translation>Использую локаль: </translation>
|
<translation>Использую локаль: </translation>
|
||||||
</message>
|
</message>
|
||||||
|
|
|
@ -75,7 +75,7 @@ std::string get_application_home_path() {
|
||||||
|
|
||||||
std::map<std::string, std::string> get_params_from_string(std::string parametersString) {
|
std::map<std::string, std::string> get_params_from_string(std::string parametersString) {
|
||||||
parametersString = boost::regex_replace(parametersString, boost::regex("%26"), "&");
|
parametersString = boost::regex_replace(parametersString, boost::regex("%26"), "&");
|
||||||
parametersString = boost::regex_replace(parametersString, boost::regex("%3D"), "=");
|
parametersString = boost::regex_replace(parametersString, boost::regex("%3[Dd]"), "=");
|
||||||
|
|
||||||
std::vector<std::string> parameters = split(parametersString, "&");
|
std::vector<std::string> parameters = split(parametersString, "&");
|
||||||
|
|
||||||
|
@ -352,15 +352,13 @@ void generate_qr_code(std::string data) {
|
||||||
#endif // ifdef BUILD_OFD_BINARYEYE_SCAN
|
#endif // ifdef BUILD_OFD_BINARYEYE_SCAN
|
||||||
|
|
||||||
#ifdef BUILD_EMAIL_MODE
|
#ifdef BUILD_EMAIL_MODE
|
||||||
std::vector<std::string> read_file(std::string path) {
|
std::string read_file(std::string &path) {
|
||||||
std::ifstream stream(path);
|
std::ifstream ifile(path, std::ios::in | std::ios::binary);
|
||||||
std::vector<std::string> lines;
|
const unsigned int size = std::filesystem::file_size(path);
|
||||||
std::string buffer;
|
std::string content(size, '\0');
|
||||||
while(getline(stream, buffer)) {
|
ifile.read(content.data(), size);
|
||||||
lines.push_back(buffer);
|
ifile.close();
|
||||||
}
|
return content;
|
||||||
stream.close();
|
|
||||||
return lines;
|
|
||||||
}
|
}
|
||||||
#endif // ifdef BUILD_EMAIL_MODE
|
#endif // ifdef BUILD_EMAIL_MODE
|
||||||
|
|
||||||
|
|
|
@ -50,6 +50,6 @@ std::string get_local_ip_address();
|
||||||
|
|
||||||
void fetch_and_download_modules();
|
void fetch_and_download_modules();
|
||||||
#ifdef BUILD_EMAIL_MODE
|
#ifdef BUILD_EMAIL_MODE
|
||||||
std::vector<std::string> read_file(std::string path);
|
std::string read_file(std::string &path);
|
||||||
#endif
|
#endif
|
||||||
#endif // UTILS_H
|
#endif // UTILS_H
|
||||||
|
|
Loading…
Reference in New Issue