Documentation
¶
Index ¶
- func BatchPdfToPng(pdfDir, imageDir string) error
- func CleanupImages(commonDirs *config.CommonDirs) model.CliFunc
- func DownloadPdfsCmd(commonDirs *config.CommonDirs) model.CliFunc
- func DownloadUrlsCmd(commonDirs *config.CommonDirs) model.CliFunc
- func OcrImages(commonDirs *config.CommonDirs) model.CliFunc
- func PdfToPng(commonDirs *config.CommonDirs) model.CliFunc
- func UpdateBucketItemIndex(commonDirs *config.CommonDirs) model.CliFunc
- func UpdateFolders(commonDirs *config.CommonDirs) model.CliFunc
- func UploadPdfs(commonDirs *config.CommonDirs) model.CliFunc
- func WriteImage(result ConversionResult) error
- type ConversionResult
- type ImageExtractor
- type ImageIterator
- type NestedIterator
- type PdfConverter
- func (p *PdfConverter) ConvertIfNotPresent(extension string) error
- func (p *PdfConverter) CreateImageDir() error
- func (p *PdfConverter) CreateImageFile(pageNumber int, extension string) (*os.File, error)
- func (p *PdfConverter) GetImageName(pageNumber int, extension string) string
- func (p *PdfConverter) ImageDirExists() (bool, error)
- type PdfConverterV2
- type TessClient
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func BatchPdfToPng ¶
func CleanupImages ¶
func CleanupImages(commonDirs *config.CommonDirs) model.CliFunc
func DownloadPdfsCmd ¶
func DownloadPdfsCmd(commonDirs *config.CommonDirs) model.CliFunc
func DownloadUrlsCmd ¶
func DownloadUrlsCmd(commonDirs *config.CommonDirs) model.CliFunc
func OcrImages ¶
func OcrImages(commonDirs *config.CommonDirs) model.CliFunc
OcrImages TODOs: (1) Reuse the client; there is no need to re-create it, just call SetImage for each image. (2) Use error groups (golang.org/x/sync/errgroup) instead of sending errors from goroutines to the main goroutine. (3) Batch or async I/O operations: batch the opening/closing of files, or do it asynchronously. (4) Use buffered channels for the done and failed channels. (5) Don't double-loop over the images. (6) Logging in a tight loop can be a performance hit; aggregate logs and emit them less frequently, probably logging once per n files completed. (7) Write files in parallel.
func PdfToPng ¶
func PdfToPng(commonDirs *config.CommonDirs) model.CliFunc
PdfToPng converts PDF files to PNG files.
func UpdateBucketItemIndex ¶
func UpdateBucketItemIndex(commonDirs *config.CommonDirs) model.CliFunc
func UpdateFolders ¶
func UpdateFolders(commonDirs *config.CommonDirs) model.CliFunc
func UploadPdfs ¶
func UploadPdfs(commonDirs *config.CommonDirs) model.CliFunc
func WriteImage ¶
func WriteImage(result ConversionResult) error
Types ¶
type ConversionResult ¶
type ImageExtractor ¶
type ImageExtractor struct { TessClient *TessClient ImagePath string CsvPath string CommonDirs *config.CommonDirs }
func NewImageExtractor ¶
func NewImageExtractor(tessClient *TessClient, imagePath string, commonDirs *config.CommonDirs) *ImageExtractor
func (*ImageExtractor) CsvExists ¶
func (i *ImageExtractor) CsvExists() (bool, error)
func (*ImageExtractor) ExtractIfNotExists ¶
func (i *ImageExtractor) ExtractIfNotExists() ([]*model.OcrResult, error)
func (*ImageExtractor) WriteResults ¶
func (i *ImageExtractor) WriteResults(results []*model.OcrResult) error
type ImageIterator ¶
type ImageIterator struct { BaseDir string SubDirs []fs.DirEntry CurrentDir string CurrentImages []fs.DirEntry CurrentIdx int SubDirIdx int }
func NewImageIterator ¶
func NewImageIterator(baseDir string) (*ImageIterator, error)
func (*ImageIterator) GetNext ¶
func (it *ImageIterator) GetNext() string
func (*ImageIterator) HasNext ¶
func (it *ImageIterator) HasNext() bool
type NestedIterator ¶
type PdfConverter ¶
type PdfConverter struct { PdfPath string CommonDirs *config.CommonDirs BaseFileName string ImageDir string }
func NewPdfConverter ¶
func NewPdfConverter(pdfPath string, commonDirs *config.CommonDirs) *PdfConverter
func (*PdfConverter) ConvertIfNotPresent ¶
func (p *PdfConverter) ConvertIfNotPresent(extension string) error
func (*PdfConverter) CreateImageDir ¶
func (p *PdfConverter) CreateImageDir() error
func (*PdfConverter) CreateImageFile ¶
func (*PdfConverter) GetImageName ¶
func (p *PdfConverter) GetImageName(pageNumber int, extension string) string
func (*PdfConverter) ImageDirExists ¶
func (p *PdfConverter) ImageDirExists() (bool, error)
type PdfConverterV2 ¶
func NewPdfConverterV2 ¶
func NewPdfConverterV2(pdfPath, imageDir string) *PdfConverterV2
NewPdfConverterV2 TODO: use a flat file structure instead of the nested structure. Images will be organized by {name}-{page no}.png, so they'll maintain order.
func (*PdfConverterV2) ConvertPagesToImages ¶
func (p *PdfConverterV2) ConvertPagesToImages(extension string) ([]ConversionResult, error)
func (*PdfConverterV2) GetImageName ¶
func (p *PdfConverterV2) GetImageName(pageNumber int, extension string) string
type TessClient ¶
type TessClient struct { TessDataPrefix string Language string ImagePath string Client *gosseract.Client }
func NewTessClientDefault ¶
func NewTessClientDefault() (*TessClient, error)
func (*TessClient) ExtractImageToResults ¶
func (t *TessClient) ExtractImageToResults(imagePath string) ([]*model.OcrResult, error)