Improved the code to handle situations where the names of goods are not known

This commit is contained in:
leca 2025-06-13 18:55:36 +03:00
parent 94acf816ea
commit a3ecaeef55
4 changed files with 31 additions and 20 deletions

View File

@ -30,9 +30,15 @@
#include <QPushButton>
#include <utils/base64.h>
#include <utils/utils.h>
int main(int argc, char *argv[]) {
// std::string path = "/tmp/html";
// std::string content = read_file(path);
// Check c = parseOfdRuAnswer(content);
// std::cout <<
// return 0;
// EmailParser p;
// p.parse_file("/home/leca/example_email_receipts/avito.eml");
// p.parse_file("/home/leca/example_email_receipts/читай_город.eml");

View File

@ -671,7 +671,7 @@
<context>
<name>QObject</name>
<message>
<location filename="../main.cpp" line="76"/>
<location filename="../main.cpp" line="82"/>
<source>Using locale: </source>
<translation>Using locale: </translation>
</message>

View File

@ -647,7 +647,7 @@
<context>
<name>QObject</name>
<message>
<location filename="../main.cpp" line="76"/>
<location filename="../main.cpp" line="82"/>
<source>Using locale: </source>
<translation>Использую локаль: </translation>
</message>

View File

@ -25,6 +25,9 @@
#include <boost/regex.hpp>
#include <net/net.h>
#include <settings/settings.h>
#include <boost/regex.hpp>
#include <boost/algorithm/string/regex.hpp>
#include <boost/algorithm/string.hpp>
#ifdef BUILD_OFD_BINARYEYE_SCAN
std::string get_local_ip_address() {
@ -194,33 +197,40 @@ std::wstring trim_html_response(std::wstring& check) {
return trimmed;
}
// Searches `html` for every match of `regex` (iterating with boost::sregex_iterator) and
// returns the collected results as wide strings converted with from_utf8.
// NOTE(review): this text comes from a commit diff rendered without +/- markers, so removed
// and added lines are interleaved. The four-parameter signature and the statements that use
// `found_entry`, `html_start` and `html_end` appear to be the pre-commit version, and the
// two-parameter signature the post-commit one; confirm against the repository.
std::vector<std::wstring> find_in_html(std::string& html, std::string regex, std::string html_start, std::string html_end) {
boost::regex searching_regex(regex);
std::vector<std::wstring> find_in_html(std::string& html, std::string regex) {
// NOTE(review): match_single_line is spelled through boost::match_flag_type, i.e. it looks like
// a match-time flag, whereas the second constructor argument of boost::regex is presumably a
// set of syntax options; verify that passing it here has the intended effect.
boost::regex searching_regex(regex, boost::match_flag_type::match_single_line);
std::vector<std::wstring> parsed;
for (boost::sregex_iterator it{html.begin(), html.end(), searching_regex}, end{};
it != end; it++) {
std::wstring found_entry = from_utf8(it->str());
// std::wstring found_entry = from_utf8(it->str());
// The whole matched text is stored, converted with from_utf8.
parsed.push_back(from_utf8(it->str()));
// std::cout << "Found: " << to_utf8(found_entry) << std::endl;
// Cuts the part of the match between html_start and html_end (uses the extra parameters of
// the four-parameter signature).
std::wstring extracted = substring_from_to(found_entry, from_utf8(html_start), from_utf8(html_end));
// std::wstring extracted = substring_from_to(found_entry, from_utf8(html_start), from_utf8(html_end));
// std::cout << "Extracted: " << to_utf8(extracted) << std::endl;
parsed.push_back(extracted);
// parsed.push_back(extracted);
}
return parsed;
}
// Extracts the text of the `ifw-col ifw-col-1 text-left` div elements from `html` and returns
// one wide string per match.
// NOTE(review): this text comes from a commit diff rendered without +/- markers. The first
// `return find_in_html(...)` line (four arguments) appears to be the removed pre-commit body and
// the statements after it the added one; confirm against the repository.
std::vector<std::wstring> find_products_in_html(std::string html) {
return find_in_html(html, "<div class=\"ifw-col ifw-col-1 text-left\"><b>.{2,100}<\\/b><\\/div>", "<div class=\"ifw-col ifw-col-1 text-left\"><b>", "<\\/b><\\/div>");
// The look-behind requires a newline, 20 whitespace characters and the opening div; the match
// itself is up to 100 characters that are followed by an optional "</b>" and then "</div>".
// NOTE(review): `.{0,100}` is greedy, so a match may extend over a closing "</b>" up to the
// last position (within 100 characters) that is followed by "</div>"; only "<b>" is erased
// below. Verify that a trailing "</b>" cannot remain in the result.
boost::regex search_regex("(?<=\\n\\s{20}<div class=\"ifw-col ifw-col-1 text-left\">).{0,100}(?=(<\\/b>)?<\\/div>)");
boost::regex b_regex("<b>");
std::vector<std::wstring> parsed;
for (boost::sregex_iterator it{html.begin(), html.end(), search_regex}, end{};
it != end; it++) {
std::string found = it->str();
// Remove the opening bold tag from the matched text.
boost::erase_regex(found, b_regex);
// Each "&nbsp;" entity in the matched text is substituted with "?".
found = boost::regex_replace(found, boost::regex("&nbsp;"), "?");
parsed.push_back(from_utf8(found));
}
return parsed;
}
// Returns the numeric strings that find_in_html extracts from `html` for the amount pattern.
// NOTE(review): this text comes from a commit diff rendered without +/- markers. The
// four-argument find_in_html call, the comma-to-dot loop and `return founds;` appear to be the
// removed pre-commit body, and the single `return find_in_html(...)` with the look-around
// pattern the added one; confirm against the repository.
std::vector<std::wstring> find_amounts_in_html(std::string html) {
std::vector<std::wstring> founds = find_in_html(html, "<div><span>\\d+(\\.|\\,)?\\d{0,3}<\\/span>", "<span>", "<\\/span>");
// Normalise the decimal separator: every ',' in a found string becomes '.'.
for (auto &found : founds) {
std::replace(found.begin(), found.end(), ',', '.');
}
// This pattern matches digits, a '.' or ',' and two more digits located between
// "X </span><span>" and "</span>"; no comma-to-dot replacement is applied on this path.
return find_in_html(html, "(?<=X <\\/span><span>)\\d+(\\.|,)\\d{2}(?=<\\/span>)");
return founds;
}
std::vector<std::wstring> find_net_weights_in_names(std::vector<std::wstring> &names) {
@ -247,12 +257,7 @@ std::vector<std::wstring> find_net_weights_in_names(std::vector<std::wstring> &n
}
// Returns the numeric strings that find_in_html extracts from `html` for the price pattern.
// NOTE(review): this text comes from a commit diff rendered without +/- markers. The
// four-argument find_in_html call, the comma-to-dot loop and `return founds;` appear to be the
// removed pre-commit body, and the final `return find_in_html(...)` with the look-around
// pattern the added one; confirm against the repository.
std::vector<std::wstring> find_prices_in_html(std::string html) {
std::vector<std::wstring> founds = find_in_html(html, "X <\\/span><span>\\d+(\\.|,)\\d{2}<\\/span>", "X <\\/span><span>", "<\\/span>");
// Normalise the decimal separator: every ',' in a found string becomes '.'.
for (auto &found : founds) {
std::replace(found.begin(), found.end(), ',', '.');
}
return founds;
// This pattern matches digits, an optional '.' or ',' and up to three more digits located
// between "<div><span>" and "</span>"; no comma-to-dot replacement is applied on this path.
return find_in_html(html, "(?<=<div><span>)\\d+(\\.|\\,)?\\d{0,3}(?=<\\/span>)");
}
void dumpVectorsToStderr(std::vector<std::wstring> &products, std::vector<std::wstring> &amounts, std::vector<std::wstring> &net_weights, std::vector<std::wstring> &prices) {