basic email parsing
This commit is contained in:
@@ -1,13 +1,20 @@
|
||||
#include "utils/utils.h"
|
||||
#include "utils/base64.h"
|
||||
#include <email_parser/emailparser.h>
|
||||
|
||||
#include <opencv2/opencv.hpp>
|
||||
#include <opencv2/imgcodecs.hpp>
|
||||
#include <opencv2/objdetect.hpp>
|
||||
#include <opencv2/core/core.hpp>
|
||||
#include <opencv2/videoio.hpp>
|
||||
|
||||
#include <utils/utils.h>
|
||||
#include <check/check.h>
|
||||
#include <boost/regex.hpp>
|
||||
#include <sstream>
|
||||
#include <boost/algorithm/string/regex.hpp>
|
||||
#include <iostream>
|
||||
#include <bits/stdc++.h>
|
||||
#include <fstream>
|
||||
|
||||
#if __GNUC__ < 8 && __clang_major__ < 17
|
||||
# include <experimental/filesystem>
|
||||
using namespace std::experimental::filesystem;
|
||||
@@ -16,78 +23,12 @@
|
||||
using namespace std::filesystem;
|
||||
#endif
|
||||
|
||||
/// Extracts the payload (body) of a raw e-mail message.
///
/// The extraction itself is not implemented: the draft below was disabled in
/// the original source, so every input currently yields an empty payload.
///
/// @param email_content Raw e-mail text; it is not modified.
/// @return Always an empty string for now.
std::string EmailParser::get_payload_in_email(std::string &email_content) {
    // Disabled draft, kept for reference:
    // boost::regex body_start_regex("\r\n\r\n"); //boost::regex_constants::egrep
    // boost::smatch smatch;
    // if (boost::regex_search(email_content, smatch, body_start_regex)) {
    //     return email_content.substr(smatch.position(), email_content.length());
    // }
    // return "";

    // The parameter stays unused until the draft above is finished.
    static_cast<void>(email_content);

    // A non-void function has to return a value; flowing off the end (as the
    // previous version did) is undefined behaviour.
    return std::string();
}
|
||||
|
||||
/// Collects every "Content-Type" header found in an e-mail file.
///
/// The file is read line by line. A line whose text before the first ':' is
/// exactly "Content-Type" is stored as (key, text after that ':'). Lines that
/// start with a tab directly after such a header are treated as its
/// continuation and stored, unmodified, under the same key.
///
/// Values are stored as read: leading whitespace after the ':' and any
/// trailing '\r' left by std::getline are kept.
///
/// @param path Path of the e-mail file to scan.
/// @return All collected entries; empty when the file cannot be read or holds
///         no "Content-Type" header.
std::multimap<std::string, std::string> EmailParser::parse_email_content_types(std::string path) {
    std::multimap<std::string, std::string> mail_options;
    std::ifstream input_file(path, std::ios::in);

    // Key of the header that tab-indented lines continue; empty when the
    // previous header was not a "Content-Type" one.
    std::string latest_key;
    std::string line;

    while (std::getline(input_file, line)) {
        // Continuation lines must be handled before the key filter: their
        // "key" starts with a tab and would never equal "Content-Type".
        const bool is_continuation = !line.empty() && line.front() == '\t';
        if (is_continuation) {
            if (!latest_key.empty()) {
                mail_options.emplace(latest_key, line);
            }
            continue;
        }

        // Split at the first ':' only, so that ':' characters inside the
        // value are preserved. Without a ':' the whole line is the key.
        const std::string::size_type colon = line.find(':');
        const std::string key = line.substr(0, colon);

        if (key != "Content-Type") {
            // Another header (or a blank/body line) ends the previous header,
            // so later tab-indented lines must not be attributed to it.
            latest_key.clear();
            continue;
        }

        std::string value;
        if (colon != std::string::npos) {
            value = line.substr(colon + 1);
        }

        // std::cout << "key: " << key << "\nvalue: " << value << std::endl;
        mail_options.emplace(key, std::move(value));
        latest_key = key;
    }

    return mail_options;
}
|
||||
|
||||
// std::vector<int> EmailParser::find_base64_blocks_in_email(std::string &email_content) {
|
||||
// std::string glued_together;
|
||||
// for (auto c : email_content) {
|
||||
// if (c == '\n') continue;
|
||||
// glued_together.push_back(c);
|
||||
// }
|
||||
// boost::regex base64_regex("^[-A-Za-z0-9+/]*={0,3}$");
|
||||
// }
|
||||
|
||||
|
||||
|
||||
/// Creates a parser; there is no state to initialise.
EmailParser::EmailParser() = default;
|
||||
|
||||
/// Locates every "Content-Type: image/" header in a raw e-mail.
///
/// A plain substring search is used instead of the former pattern
/// "Content-Type: image/.*": Boost.Regex lets '.' match line breaks unless
/// told otherwise, so the greedy ".*" could swallow the rest of the message
/// and hide every later header. Only the start offsets are needed, so the
/// literal prefix is sufficient.
///
/// @param email_content Raw e-mail text; it is not modified.
/// @return Start offsets of all occurrences, in ascending order.
std::vector<int> search_content_types_in_email_content(std::string& email_content) {
    static const std::string image_header_prefix = "Content-Type: image/";

    std::vector<int> content_type_positions;
    std::string::size_type pos = email_content.find(image_header_prefix);
    while (pos != std::string::npos) {
        // The return type fixes the element type to int; make the narrowing explicit.
        content_type_positions.push_back(static_cast<int>(pos));
        // Resume after the current hit so that matches never overlap.
        pos = email_content.find(image_header_prefix, pos + image_header_prefix.size());
    }
    return content_type_positions;
}
|
||||
|
||||
// void find_and_decode_email_content()
|
||||
|
||||
Check EmailParser::parse(std::string &email_content) {
|
||||
std::map<std::string, std::string> EmailParser::parse(std::string &email_content) {
|
||||
//1. Search "Content-Type: image/.*" in the .eml file.
|
||||
// 1.1 If found 0, go to [2]
|
||||
// 1.2 If found 1, try decoding it, if it's not a QR code, go to [2]
|
||||
@@ -95,86 +36,72 @@ Check EmailParser::parse(std::string &email_content) {
|
||||
//2. Try decoding content of the e-mail
|
||||
//3. Search "t=\d{8}T\d{4,6}&s=\d{1,6}\.\d{1,2}&fn=\d{10,16}&i=\d{6}&fp=\d{10}&n=\d". Note that in some emails = and & signs could be replaced with its code in HTTP requests: %3D, %26
|
||||
// 3.1 If not found, notify the user that we could not parse the .eml file
|
||||
std::vector<int> content_type_positions = search_content_types_in_email_content(email_content);
|
||||
|
||||
if (content_type_positions.size() < 0) {
|
||||
/* Find content-types */
|
||||
Check c;
|
||||
std::vector<std::pair<int, int>> content_types = {};
|
||||
boost::regex content_type_regex("Content-Type: image/(gif|png|jpg)");
|
||||
boost::regex part_end_regex("--.{5,48}");
|
||||
|
||||
} else if (content_type_positions.size() == 1) {
|
||||
for (boost::sregex_iterator it{email_content.begin(), email_content.end(), content_type_regex}, end{}; it != end; it ++) {
|
||||
unsigned int start_position = it->position(), end_position = -1;
|
||||
|
||||
} else {
|
||||
|
||||
}
|
||||
|
||||
// std::string payload = get_payload_in_email(email_content);
|
||||
// Check c;
|
||||
|
||||
// std::cout << payload << std::endl;
|
||||
|
||||
// if (payload == "")
|
||||
// return c;
|
||||
|
||||
|
||||
|
||||
// return c;
|
||||
}
|
||||
|
||||
Check EmailParser::parse_file(std::string path) {
|
||||
|
||||
// std::vector<std::string> contents = read_file(path);
|
||||
// unsigned int body_start = -1;
|
||||
// for (unsigned int i = 0; i < contents.size(); i ++) {
|
||||
// std::string &line = contents[i];
|
||||
// if (line == "\r") {
|
||||
// body_start = i;
|
||||
// break;
|
||||
// }
|
||||
// }
|
||||
// if (body_start == (unsigned int) -1) throw "Not an E-Mail";
|
||||
|
||||
// for (unsigned int i = 0; i < contents.size(); i ++) {
|
||||
// std::string &line = contents[i];
|
||||
// if (line[0] == '\t') {
|
||||
// contents[i - 1] += " " + contents[i];
|
||||
// contents.erase(remove(contents.begin(), contents.end(), line), contents.end());
|
||||
// i -= 2;
|
||||
// }
|
||||
// }
|
||||
|
||||
// for (auto &line : contents) {
|
||||
// std::cout << line << std::endl;
|
||||
// }
|
||||
|
||||
// std::cout << contents[body_start + 1] << std::endl;
|
||||
// unsigned int body_start = contents.find("\r\n\r\n");
|
||||
// if (body_start == (unsigned int)-1)
|
||||
// throw "Not a E-Mail file";
|
||||
// std::cout << contents.erase(0, body_start + 4);
|
||||
// std::cout << contents << std::endl;
|
||||
|
||||
// std::vector<std::tuple<int, int>> message_parts_positions;
|
||||
|
||||
// while (contents.find("--") > 0) {
|
||||
|
||||
// }
|
||||
|
||||
return Check();
|
||||
|
||||
std::multimap<std::string, std::string> content_types = parse_email_content_types(path);
|
||||
bool found_qr_image = false;
|
||||
for (auto &content_type : content_types) {
|
||||
boost::regex image_content_type_regex("image\\/(png|gif|jpg|jpeg)");
|
||||
boost::cmatch cmatch;
|
||||
if (boost::regex_match(content_type.second.c_str(), cmatch, image_content_type_regex)) {
|
||||
std::cout << cmatch << std::endl;
|
||||
|
||||
for (boost::sregex_iterator it2{email_content.begin() + start_position, email_content.end(), part_end_regex}, end2{}; it2 != end2; it2++) {
|
||||
end_position = it2->position();
|
||||
break;
|
||||
}
|
||||
std::cout << content_type.first << ": " << content_type.second << std::endl;
|
||||
content_types.push_back(std::pair<int, int>(start_position, end_position));
|
||||
}
|
||||
|
||||
// std::ifstream ifile(path, std::ios::in | std::ios::binary);
|
||||
// const unsigned int size = std::filesystem::file_size(path);
|
||||
// std::string content(size, '\0');
|
||||
// ifile.read(content.data(), size);
|
||||
// return parse(content);
|
||||
return Check();
|
||||
/* iterate through found content-types and try searching qr codes, decode them and see if it's the needed data */
|
||||
|
||||
for (unsigned int i = 0; i < content_types.size(); i ++) {
|
||||
boost::regex to_erase("(Content.{5,64}\\r\\n)+");
|
||||
boost::regex to_erase_two("--.{5,48}");
|
||||
std::string part = email_content.substr(content_types[i].first, content_types[i].second);
|
||||
boost::erase_regex(part, to_erase);
|
||||
boost::erase_regex(part, to_erase_two);
|
||||
part.erase(std::remove(part.begin(), part.end(), '\r'), part.end());
|
||||
part.erase(std::remove(part.begin(), part.end(), '\n'), part.end());
|
||||
std::string decoded = base64_decode(part);
|
||||
cv::Mat image;
|
||||
|
||||
if (decoded.substr(1, 3) == "PNG" || decoded.substr(1, 3) == "JPG") {
|
||||
std::vector<uchar> data(decoded.begin(), decoded.end());
|
||||
image = cv::imdecode(cv::Mat(data), 1);
|
||||
} else if (decoded.substr(0, 3) == "GIF") {
|
||||
std::string gif_file_path = get_application_home_path() + "/temp.gif";
|
||||
|
||||
std::ofstream gif_output(gif_file_path, std::ios::binary);
|
||||
gif_output << decoded;
|
||||
gif_output.close();
|
||||
cv::VideoCapture gif(gif_file_path, cv::CAP_FFMPEG);
|
||||
gif.read(image);
|
||||
}
|
||||
|
||||
cv::QRCodeDetector qrDecoder = cv::QRCodeDetector();
|
||||
std::string decoded_qr_params = qrDecoder.detectAndDecode(image);
|
||||
boost::regex check_data_content("t=\\d+T\\d+&s=\\d+\\.\\d+&fn=\\d{16}&i=\\d{4,5}&fp=\\d{10}&n=\\d");
|
||||
if (boost::regex_match(decoded_qr_params, check_data_content)) {
|
||||
std::map<std::string, std::string> paramsMap = get_params_from_string(decoded_qr_params);
|
||||
|
||||
return paramsMap;
|
||||
}
|
||||
}
|
||||
|
||||
/* If the E-Mail has no QR code in it as a separate part, there is possibly a QR code embedded via an HTML <img> tag with a base64-encoded image. Try searching for it */
|
||||
|
||||
/* If there's no such case, the last chance is <img src="..."> which will have a link with needed parameters or the qr code that should be downloaded and decoded */
|
||||
|
||||
/* If the code has reached this part and found nothing, it's most likely that there are no QR codes at all. */
|
||||
// return Check();
|
||||
}
|
||||
|
||||
std::map<std::string, std::string> EmailParser::parse_file(std::string path) {
|
||||
std::ifstream ifile(path, std::ios::in | std::ios::binary);
|
||||
const unsigned int size = std::filesystem::file_size(path);
|
||||
std::string content(size, '\0');
|
||||
ifile.read(content.data(), size);
|
||||
return parse(content);
|
||||
return std::map<std::string, std::string>();
|
||||
}
|
||||
|
||||
@@ -5,14 +5,10 @@
|
||||
#include <map>
|
||||
|
||||
// Parser for e-mail files; every member takes either raw e-mail text or the
// path of an e-mail file.
// NOTE(review): this view shows parse()/parse_file() declared twice, once
// returning Check and once returning std::map. Overloads that differ only in
// their return type cannot coexist in C++ - this looks like old and new lines
// of a diff shown together; confirm which pair is current.
class EmailParser {
|
||||
// Meant to extract the payload of a raw e-mail (the visible definition contains no extraction logic yet).
std::string get_payload_in_email(std::string &email_content);
|
||||
// Reads the file at `path` line by line and collects its "Content-Type" headers.
std::multimap<std::string, std::string> parse_email_content_types(std::string path);
|
||||
|
||||
// std::vector<int> find_base64_blocks_in_email(std::string &email_content);
|
||||
public:
|
||||
// Constructs a parser; the visible definition has an empty body.
EmailParser();
|
||||
// Parses raw e-mail text (variant returning Check).
Check parse(std::string &email_content);
|
||||
// Parses the e-mail file at `path` (variant returning Check).
Check parse_file(std::string path);
|
||||
// Parses raw e-mail text (variant returning a string-to-string map).
std::map<std::string, std::string> parse(std::string &email_content);
|
||||
// Reads the file at `path` and forwards its content to parse().
std::map<std::string, std::string> parse_file(std::string path);
|
||||
|
||||
};
|
||||
|
||||
|
||||
Reference in New Issue
Block a user