tesseract  5.0.0
tesseract::TessPDFRenderer Class Reference

#include <renderer.h>

Inheritance diagram for tesseract::TessPDFRenderer:
tesseract::TessResultRenderer

Public Member Functions

 TessPDFRenderer (const char *outputbase, const char *datadir, bool textonly=false)
 
- Public Member Functions inherited from tesseract::TessResultRenderer
virtual ~TessResultRenderer ()
 
void insert (TessResultRenderer *next)
 
TessResultRenderer * next ()
 
bool BeginDocument (const char *title)
 
bool AddImage (TessBaseAPI *api)
 
bool EndDocument ()
 
const char * file_extension () const
 
const char * title () const
 
bool happy () const
 
int imagenum () const
 

Protected Member Functions

bool BeginDocumentHandler () override
 
bool AddImageHandler (TessBaseAPI *api) override
 
bool EndDocumentHandler () override
 
- Protected Member Functions inherited from tesseract::TessResultRenderer
 TessResultRenderer (const char *outputbase, const char *extension)
 
void AppendString (const char *s)
 
void AppendData (const char *s, int len)
 

Detailed Description

Renders Tesseract output into a searchable PDF.

Definition at line 216 of file renderer.h.

Constructor & Destructor Documentation

◆ TessPDFRenderer()

tesseract::TessPDFRenderer::TessPDFRenderer ( const char *  outputbase,
const char *  datadir,
bool  textonly = false 
)

Definition at line 183 of file pdfrenderer.cpp.

184  : TessResultRenderer(outputbase, "pdf"), datadir_(datadir) {
185  obj_ = 0;
186  textonly_ = textonly;
187  offsets_.push_back(0);
188 }
TessResultRenderer(const char *outputbase, const char *extension)
Definition: renderer.cpp:33

Member Function Documentation

◆ AddImageHandler()

bool tesseract::TessPDFRenderer::AddImageHandler ( TessBaseAPI *  api )
override protected virtual

Implements tesseract::TessResultRenderer.

Definition at line 804 of file pdfrenderer.cpp.

804  {
805  Pix *pix = api->GetInputImage();
806  const char *filename = api->GetInputName();
807  int ppi = api->GetSourceYResolution();
808  if (!pix || ppi <= 0) {
809  return false;
810  }
811  double width = pixGetWidth(pix) * 72.0 / ppi;
812  double height = pixGetHeight(pix) * 72.0 / ppi;
813 
814  std::stringstream xobject;
815  // Use "C" locale (needed for int values larger than 999).
816  xobject.imbue(std::locale::classic());
817  if (!textonly_) {
818  xobject << "/XObject << /Im1 " << (obj_ + 2) << " 0 R >>\n";
819  }
820 
821  // PAGE
822  std::stringstream stream;
823  // Use "C" locale (needed for double values width and height).
824  stream.imbue(std::locale::classic());
825  stream.precision(2);
826  stream << std::fixed << obj_
827  << " 0 obj\n"
828  "<<\n"
829  " /Type /Page\n"
830  " /Parent 2 0 R\n" // Pages object
831  " /MediaBox [0 0 "
832  << width << " " << height
833  << "]\n"
834  " /Contents "
835  << (obj_ + 1)
836  << " 0 R\n" // Contents object
837  " /Resources\n"
838  " <<\n"
839  " "
840  << xobject.str() << // Image object
841  " /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n"
842  " /Font << /f-0-0 3 0 R >>\n" // Type0 Font
843  " >>\n"
844  ">>\n"
845  "endobj\n";
846  pages_.push_back(obj_);
847  AppendPDFObject(stream.str().c_str());
848 
849  // CONTENTS
850  const std::unique_ptr<char[]> pdftext(GetPDFTextObjects(api, width, height));
851  const size_t pdftext_len = strlen(pdftext.get());
852  size_t len;
853  unsigned char *comp_pdftext =
854  zlibCompress(reinterpret_cast<unsigned char *>(pdftext.get()), pdftext_len, &len);
855  long comp_pdftext_len = len;
856  stream.str("");
857  stream << obj_
858  << " 0 obj\n"
859  "<<\n"
860  " /Length "
861  << comp_pdftext_len
862  << " /Filter /FlateDecode\n"
863  ">>\n"
864  "stream\n";
865  AppendString(stream.str().c_str());
866  long objsize = stream.str().size();
867  AppendData(reinterpret_cast<char *>(comp_pdftext), comp_pdftext_len);
868  objsize += comp_pdftext_len;
869  lept_free(comp_pdftext);
870  const char *b2 =
871  "endstream\n"
872  "endobj\n";
873  AppendString(b2);
874  objsize += strlen(b2);
875  AppendPDFObjectDIY(objsize);
876 
877  if (!textonly_) {
878  char *pdf_object = nullptr;
879  int jpg_quality;
880  api->GetIntVariable("jpg_quality", &jpg_quality);
881  if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize, jpg_quality)) {
882  return false;
883  }
884  AppendData(pdf_object, objsize);
885  AppendPDFObjectDIY(objsize);
886  delete[] pdf_object;
887  }
888  return true;
889 }
void AppendString(const char *s)
Definition: renderer.cpp:111
void AppendData(const char *s, int len)
Definition: renderer.cpp:115

◆ BeginDocumentHandler()

bool tesseract::TessPDFRenderer::BeginDocumentHandler ( )
override protected virtual

Reimplemented from tesseract::TessResultRenderer.

Definition at line 483 of file pdfrenderer.cpp.

483  {
484  AppendPDFObject("%PDF-1.5\n%\xDE\xAD\xBE\xEB\n");
485 
486  // CATALOG
487  AppendPDFObject(
488  "1 0 obj\n"
489  "<<\n"
490  " /Type /Catalog\n"
491  " /Pages 2 0 R\n"
492  ">>\nendobj\n");
493 
494  // We are reserving object #2 for the /Pages
495  // object, which I am going to create and write
496  // at the end of the PDF file.
497  AppendPDFObject("");
498 
499  // TYPE0 FONT
500  AppendPDFObject(
501  "3 0 obj\n"
502  "<<\n"
503  " /BaseFont /GlyphLessFont\n"
504  " /DescendantFonts [ 4 0 R ]\n" // CIDFontType2 font
505  " /Encoding /Identity-H\n"
506  " /Subtype /Type0\n"
507  " /ToUnicode 6 0 R\n" // ToUnicode
508  " /Type /Font\n"
509  ">>\n"
510  "endobj\n");
511 
512  // CIDFONTTYPE2
513  std::stringstream stream;
514  // Use "C" locale (needed for int values larger than 999).
515  stream.imbue(std::locale::classic());
516  stream << "4 0 obj\n"
517  "<<\n"
518  " /BaseFont /GlyphLessFont\n"
519  " /CIDToGIDMap 5 0 R\n" // CIDToGIDMap
520  " /CIDSystemInfo\n"
521  " <<\n"
522  " /Ordering (Identity)\n"
523  " /Registry (Adobe)\n"
524  " /Supplement 0\n"
525  " >>\n"
526  " /FontDescriptor 7 0 R\n" // Font descriptor
527  " /Subtype /CIDFontType2\n"
528  " /Type /Font\n"
529  " /DW "
530  << (1000 / kCharWidth)
531  << "\n"
532  ">>\n"
533  "endobj\n";
534  AppendPDFObject(stream.str().c_str());
535 
536  // CIDTOGIDMAP
537  const int kCIDToGIDMapSize = 2 * (1 << 16);
538  const std::unique_ptr<unsigned char[]> cidtogidmap(new unsigned char[kCIDToGIDMapSize]);
539  for (int i = 0; i < kCIDToGIDMapSize; i++) {
540  cidtogidmap[i] = (i % 2) ? 1 : 0;
541  }
542  size_t len;
543  unsigned char *comp = zlibCompress(cidtogidmap.get(), kCIDToGIDMapSize, &len);
544  stream.str("");
545  stream << "5 0 obj\n"
546  "<<\n"
547  " /Length "
548  << len
549  << " /Filter /FlateDecode\n"
550  ">>\n"
551  "stream\n";
552  AppendString(stream.str().c_str());
553  long objsize = stream.str().size();
554  AppendData(reinterpret_cast<char *>(comp), len);
555  objsize += len;
556  lept_free(comp);
557  const char *endstream_endobj =
558  "endstream\n"
559  "endobj\n";
560  AppendString(endstream_endobj);
561  objsize += strlen(endstream_endobj);
562  AppendPDFObjectDIY(objsize);
563 
564  const char stream2[] =
565  "/CIDInit /ProcSet findresource begin\n"
566  "12 dict begin\n"
567  "begincmap\n"
568  "/CIDSystemInfo\n"
569  "<<\n"
570  " /Registry (Adobe)\n"
571  " /Ordering (UCS)\n"
572  " /Supplement 0\n"
573  ">> def\n"
574  "/CMapName /Adobe-Identify-UCS def\n"
575  "/CMapType 2 def\n"
576  "1 begincodespacerange\n"
577  "<0000> <FFFF>\n"
578  "endcodespacerange\n"
579  "1 beginbfrange\n"
580  "<0000> <FFFF> <0000>\n"
581  "endbfrange\n"
582  "endcmap\n"
583  "CMapName currentdict /CMap defineresource pop\n"
584  "end\n"
585  "end\n";
586 
587  // TOUNICODE
588  stream.str("");
589  stream << "6 0 obj\n"
590  "<< /Length "
591  << (sizeof(stream2) - 1)
592  << " >>\n"
593  "stream\n"
594  << stream2
595  << "endstream\n"
596  "endobj\n";
597  AppendPDFObject(stream.str().c_str());
598 
599  // FONT DESCRIPTOR
600  stream.str("");
601  stream << "7 0 obj\n"
602  "<<\n"
603  " /Ascent 1000\n"
604  " /CapHeight 1000\n"
605  " /Descent -1\n" // Spec says must be negative
606  " /Flags 5\n" // FixedPitch + Symbolic
607  " /FontBBox [ 0 0 "
608  << (1000 / kCharWidth)
609  << " 1000 ]\n"
610  " /FontFile2 8 0 R\n"
611  " /FontName /GlyphLessFont\n"
612  " /ItalicAngle 0\n"
613  " /StemV 80\n"
614  " /Type /FontDescriptor\n"
615  ">>\n"
616  "endobj\n";
617  AppendPDFObject(stream.str().c_str());
618 
619  stream.str("");
620  stream << datadir_.c_str() << "/pdf.ttf";
621  const uint8_t *font;
622  std::ifstream input(stream.str().c_str(), std::ios::in | std::ios::binary);
623  std::vector<unsigned char> buffer(std::istreambuf_iterator<char>(input), {});
624  auto size = buffer.size();
625  if (size) {
626  font = buffer.data();
627  } else {
628 #if !defined(NDEBUG)
629  tprintf("Cannot open file \"%s\"!\nUsing internal glyphless font.\n", stream.str().c_str());
630 #endif
631  font = pdf_ttf;
632  size = sizeof(pdf_ttf);
633  }
634 
635  // FONTFILE2
636  stream.str("");
637  stream << "8 0 obj\n"
638  "<<\n"
639  " /Length "
640  << size
641  << "\n"
642  " /Length1 "
643  << size
644  << "\n"
645  ">>\n"
646  "stream\n";
647  AppendString(stream.str().c_str());
648  objsize = stream.str().size();
649  AppendData(reinterpret_cast<const char *>(font), size);
650  objsize += size;
651  AppendString(endstream_endobj);
652  objsize += strlen(endstream_endobj);
653  AppendPDFObjectDIY(objsize);
654  return true;
655 }
void tprintf(const char *format,...)
Definition: tprintf.cpp:41

◆ EndDocumentHandler()

bool tesseract::TessPDFRenderer::EndDocumentHandler ( )
override protected virtual

Reimplemented from tesseract::TessResultRenderer.

Definition at line 891 of file pdfrenderer.cpp.

891  {
892  // We reserved the /Pages object number early, so that the /Page
893  // objects could refer to their parent. We finally have enough
894  // information to go fill it in. Using lower level calls to manipulate
895  // the offset record in two spots, because we are placing objects
896  // out of order in the file.
897 
898  // PAGES
899  const long int kPagesObjectNumber = 2;
900  offsets_[kPagesObjectNumber] = offsets_.back(); // manipulation #1
901  std::stringstream stream;
902  // Use "C" locale (needed for int values larger than 999).
903  stream.imbue(std::locale::classic());
904  stream << kPagesObjectNumber << " 0 obj\n<<\n /Type /Pages\n /Kids [ ";
905  AppendString(stream.str().c_str());
906  size_t pages_objsize = stream.str().size();
907  for (const auto &page : pages_) {
908  stream.str("");
909  stream << page << " 0 R ";
910  AppendString(stream.str().c_str());
911  pages_objsize += stream.str().size();
912  }
913  stream.str("");
914  stream << "]\n /Count " << pages_.size() << "\n>>\nendobj\n";
915  AppendString(stream.str().c_str());
916  pages_objsize += stream.str().size();
917  offsets_.back() += pages_objsize; // manipulation #2
918 
919  // INFO
920  std::string utf16_title = "FEFF"; // byte_order_marker
921  std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(title());
922  char utf16[kMaxBytesPerCodepoint];
923  for (char32 code : unicodes) {
924  if (CodepointToUtf16be(code, utf16)) {
925  utf16_title += utf16;
926  }
927  }
928 
929  char *datestr = l_getFormattedDate();
930  stream.str("");
931  stream << obj_
932  << " 0 obj\n"
933  "<<\n"
934  " /Producer (Tesseract "
935  << tesseract::TessBaseAPI::Version()
936  << ")\n"
937  " /CreationDate (D:"
938  << datestr
939  << ")\n"
940  " /Title <"
941  << utf16_title.c_str()
942  << ">\n"
943  ">>\n"
944  "endobj\n";
945  lept_free(datestr);
946  AppendPDFObject(stream.str().c_str());
947  stream.str("");
948  stream << "xref\n0 " << obj_ << "\n0000000000 65535 f \n";
949  AppendString(stream.str().c_str());
950  for (int i = 1; i < obj_; i++) {
951  stream.str("");
952  stream.width(10);
953  stream.fill('0');
954  stream << offsets_[i] << " 00000 n \n";
955  AppendString(stream.str().c_str());
956  }
957  stream.str("");
958  stream << "trailer\n<<\n /Size " << obj_
959  << "\n"
960  " /Root 1 0 R\n" // catalog
961  " /Info "
962  << (obj_ - 1)
963  << " 0 R\n" // info
964  ">>\nstartxref\n"
965  << offsets_.back() << "\n%%EOF\n";
966  AppendString(stream.str().c_str());
967  return true;
968 }
signed int char32
static const char * Version()
Definition: baseapi.cpp:238
const char * title() const
Definition: renderer.h:88
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
Definition: unichar.cpp:220

The documentation for this class was generated from the following files: