diff --git a/main.cpp b/main.cpp index fd25f4e..3ebf3d4 100644 --- a/main.cpp +++ b/main.cpp @@ -30,9 +30,15 @@ #include #include +#include int main(int argc, char *argv[]) { + // std::string path = "/tmp/html"; + // std::string content = read_file(path); + // Check c = parseOfdRuAnswer(content); + // std::cout << + // return 0; // EmailParser p; // p.parse_file("/home/leca/example_email_receipts/avito.eml"); // p.parse_file("/home/leca/example_email_receipts/читай_город.eml"); diff --git a/translations/en_US.ts b/translations/en_US.ts index 651939a..258bac2 100644 --- a/translations/en_US.ts +++ b/translations/en_US.ts @@ -671,7 +671,7 @@ QObject - + Using locale: Using locale: diff --git a/translations/ru_RU.ts b/translations/ru_RU.ts index 9932c3b..6656b9c 100644 --- a/translations/ru_RU.ts +++ b/translations/ru_RU.ts @@ -647,7 +647,7 @@ QObject - + Using locale: Использую локаль: diff --git a/utils/utils.cpp b/utils/utils.cpp index 838715c..a28e43f 100644 --- a/utils/utils.cpp +++ b/utils/utils.cpp @@ -25,6 +25,9 @@ #include #include #include +#include +#include +#include #ifdef BUILD_OFD_BINARYEYE_SCAN std::string get_local_ip_address() { @@ -194,33 +197,40 @@ std::wstring trim_html_response(std::wstring& check) { return trimmed; } -std::vector find_in_html(std::string& html, std::string regex, std::string html_start, std::string html_end) { - boost::regex searching_regex(regex); +std::vector find_in_html(std::string& html, std::string regex) { + boost::regex searching_regex(regex, boost::match_flag_type::match_single_line); std::vector parsed; for (boost::sregex_iterator it{html.begin(), html.end(), searching_regex}, end{}; it != end; it++) { - - std::wstring found_entry = from_utf8(it->str()); + // std::wstring found_entry = from_utf8(it->str()); + parsed.push_back(from_utf8(it->str())); // std::cout << "Found: " << to_utf8(found_entry) << std::endl; - std::wstring extracted = substring_from_to(found_entry, from_utf8(html_start), from_utf8(html_end)); + // std::wstring extracted = substring_from_to(found_entry, from_utf8(html_start), from_utf8(html_end)); // std::cout << "Extracted: " << to_utf8(extracted) << std::endl; - parsed.push_back(extracted); + // parsed.push_back(extracted); } return parsed; } std::vector find_products_in_html(std::string html) { - return find_in_html(html, "
.{2,100}<\\/b><\\/div>", "
", "<\\/b><\\/div>"); + boost::regex search_regex("(?<=\\n\\s{20}
).{0,100}(?=(<\\/b>)?<\\/div>)"); + boost::regex b_regex(""); + + std::vector parsed; + for (boost::sregex_iterator it{html.begin(), html.end(), search_regex}, end{}; + it != end; it++) { + std::string found = it->str(); + boost::erase_regex(found, b_regex); + found = boost::regex_replace(found, boost::regex(" "), "?"); + parsed.push_back(from_utf8(found)); + } + return parsed; } std::vector find_amounts_in_html(std::string html) { - std::vector founds = find_in_html(html, "
\\d+(\\.|\\,)?\\d{0,3}<\\/span>", "", "<\\/span>"); - for (auto &found : founds) { - std::replace(found.begin(), found.end(), ',', '.'); - } + return find_in_html(html, "(?<=X <\\/span>)\\d+(\\.|,)\\d{2}(?=<\\/span>)"); - return founds; } std::vector find_net_weights_in_names(std::vector &names) { @@ -247,12 +257,7 @@ std::vector find_net_weights_in_names(std::vector &n } std::vector find_prices_in_html(std::string html) { - std::vector founds = find_in_html(html, "X <\\/span>\\d+(\\.|,)\\d{2}<\\/span>", "X <\\/span>", "<\\/span>"); - for (auto &found : founds) { - std::replace(found.begin(), found.end(), ',', '.'); - } - - return founds; + return find_in_html(html, "(?<=
)\\d+(\\.|\\,)?\\d{0,3}(?=<\\/span>)"); } void dumpVectorsToStderr(std::vector &products, std::vector &amounts, std::vector &net_weights, std::vector &prices) {