32 std::stringstream &alto_str) {
33 int left, top, right, bottom;
34 it->BoundingBox(level, &left, &top, &right, &bottom);
38 int height = bottom - top;
39 int width = right - left;
41 alto_str <<
" HPOS=\"" << hpos <<
"\"";
42 alto_str <<
" VPOS=\"" << vpos <<
"\"";
43 alto_str <<
" WIDTH=\"" << width <<
"\"";
44 alto_str <<
" HEIGHT=\"" << height <<
"\"";
48 alto_str <<
" WC=\"0." << wc <<
"\"";
59 begin_document =
true;
69 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
70 "<alto xmlns=\"http://www.loc.gov/standards/alto/ns-v3#\" "
71 "xmlns:xlink=\"http://www.w3.org/1999/xlink\" "
72 "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
73 "xsi:schemaLocation=\"http://www.loc.gov/standards/alto/ns-v3# "
74 "http://www.loc.gov/alto/v3/alto-3-0.xsd\">\n"
76 "\t\t<MeasurementUnit>pixel</MeasurementUnit>\n"
77 "\t\t<sourceImageInformation>\n"
84 "\t\t</sourceImageInformation>\n"
85 "\t\t<OCRProcessing ID=\"OCR_0\">\n"
86 "\t\t\t<ocrProcessingStep>\n"
87 "\t\t\t\t<processingSoftware>\n"
88 "\t\t\t\t\t<softwareName>tesseract ");
92 "\t\t\t\t</processingSoftware>\n"
93 "\t\t\t</ocrProcessingStep>\n"
94 "\t\t</OCRProcessing>\n"
97 begin_document =
false;
101 if (text ==
nullptr) {
121 begin_document(false) {}
140 int lcnt = 0, tcnt = 0, bcnt = 0, wcnt = 0;
148 int str16_len = MultiByteToWideChar(CP_ACP, 0,
input_file_.c_str(), -1,
nullptr, 0);
149 wchar_t *uni16_str =
new WCHAR[str16_len];
150 str16_len = MultiByteToWideChar(CP_ACP, 0,
input_file_.c_str(), -1, uni16_str, str16_len);
152 WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len,
nullptr, 0,
nullptr,
nullptr);
153 char *utf8_str =
new char[utf8_len];
154 WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len,
nullptr,
nullptr);
160 std::stringstream alto_str;
162 alto_str.imbue(std::locale::classic());
164 <<
"\" PHYSICAL_IMG_NR=\"" << page_number <<
"\""
165 <<
" ID=\"page_" << page_number <<
"\">\n"
166 <<
"\t\t\t<PrintSpace HPOS=\"0\" VPOS=\"0\""
178 alto_str <<
"\t\t\t\t<ComposedBlock ID=\"cblock_" << bcnt <<
"\"";
179 AddBoxToAlto(res_it,
RIL_BLOCK, alto_str);
184 alto_str <<
"\t\t\t\t\t<TextBlock ID=\"block_" << tcnt <<
"\"";
185 AddBoxToAlto(res_it,
RIL_PARA, alto_str);
190 alto_str <<
"\t\t\t\t\t\t<TextLine ID=\"line_" << lcnt <<
"\"";
195 alto_str <<
"\t\t\t\t\t\t\t<String ID=\"string_" << wcnt <<
"\"";
196 AddBoxToAlto(res_it,
RIL_WORD, alto_str);
197 alto_str <<
" CONTENT=\"";
203 int left, top, right, bottom;
208 if (grapheme && grapheme[0] != 0) {
209 alto_str <<
HOcrEscape(grapheme.get()).c_str();
218 if (last_word_in_line) {
219 alto_str <<
"\n\t\t\t\t\t\t</TextLine>\n";
225 int width = left - hpos;
226 alto_str <<
"<SP WIDTH=\"" << width <<
"\" VPOS=\"" << vpos <<
"\" HPOS=\"" << hpos
230 if (last_word_in_tblock) {
231 alto_str <<
"\t\t\t\t\t</TextBlock>\n";
235 if (last_word_in_cblock) {
236 alto_str <<
"\t\t\t\t</ComposedBlock>\n";
241 alto_str <<
"\t\t\t</PrintSpace>\n"
243 const std::string &text = alto_str.str();
245 char *result =
new char[text.length() + 1];
246 strcpy(result, text.c_str());
std::string HOcrEscape(const char *text)
const char * GetInputName()
std::string input_file_
Name used by training code.
int Recognize(ETEXT_DESC *monitor)
PAGE_RES * page_res_
The page-level data.
Tesseract * tesseract_
The underlying data object.
static const char * Version()
ResultIterator * GetIterator()
char * GetAltoText(ETEXT_DESC *monitor, int page_number)
void SetInputName(const char *name)
bool Empty(PageIteratorLevel level) const
bool BoundingBox(PageIteratorLevel level, int *left, int *top, int *right, int *bottom) const
void AppendString(const char *s)
bool BeginDocumentHandler() override
TessAltoRenderer(const char *outputbase)
bool EndDocumentHandler() override
bool AddImageHandler(TessBaseAPI *api) override
bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const override
virtual char * GetUTF8Text(PageIteratorLevel level) const
bool IsAtBeginningOf(PageIteratorLevel level) const override
bool Next(PageIteratorLevel level) override