Documentation ¶
Index ¶
- func CompressJPEG(Picture []byte, quality int) (compressed []byte)
- func ContainerExtractFiles(data []byte, ...)
- func DOC2Text(r io.Reader) (io.Reader, error)
- func DOCX2Text(file io.ReaderAt, size int64) (string, error)
- func DecompressFile(data []byte) (decompressed []byte, valid bool)
- func EPUB2Text(file io.ReaderAt, size int64, limit int64) (string, error)
- func HTML2Text(reader io.Reader) (pageText string, err error)
- func HTML2TextAndLinks(reader io.Reader, baseURL string) (pageText string, links []string, err error)
- func InitPDFLicense(key, name string)
- func IsExcessiveLargePicture(Picture []byte) (excessive bool, err error)
- func IsFileDOC(data []byte) bool
- func IsFileDOCX(data []byte) bool
- func IsFileMOBI(data []byte) bool
- func IsFilePPT(data []byte) bool
- func IsFilePPTX(data []byte) bool
- func IsFileRTF(data []byte) bool
- func IsFileXLSX(data []byte) bool
- func IsFileZIP(data []byte) bool
- func Mobi2Text(file io.ReadSeeker) (string, error)
- func ODS2Cells(file io.ReaderAt, size int64) (cells []string, err error)
- func ODS2Text(file io.ReaderAt, size int64, writer io.Writer, limit int64) (written int64, err error)
- func ODT2Text(file io.ReaderAt, size int64, writer io.Writer, limit int64) (written int64, err error)
- func PDFGetCreationDate(f io.ReadSeeker) (date time.Time, valid bool)
- func PDFListContentStreams(f io.ReadSeeker, w io.Writer, size int64) (written int64, err error)
- func PPTX2Text(file io.ReaderAt, size int64) (string, error)
- func RTF2Text(inputRtf string) string
- func ResizeCompressPicture(Picture []byte, Quality int, MaxWidth, MaxHeight uint) (compressed []byte, err error)
- func XLSX2Cells(file io.ReaderAt, size int64, rowLimit int) (cells []string, err error)
- func XLSX2Text(file io.ReaderAt, size int64, writer io.Writer, limit int64, rowLimit int) (written int64, err error)
- type ImageResult
- type PPTXDocument
- type PPTXSlide
- type SlideNumberSorter
- type WordDocument
- type WordParagraph
- type WordRow
- type WordStyle
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func CompressJPEG ¶
CompressJPEG compresses a JPEG picture according to the given quality. Warning: If the image claims to be large (in terms of width & height), this may use a lot of memory. Use IsExcessiveLargePicture first.
func ContainerExtractFiles ¶
func ContainerExtractFiles(data []byte, callback func(name string, size int64, date time.Time, data []byte))
ContainerExtractFiles extracts files from supported containers: ZIP, RAR, 7Z, TAR
func DOC2Text ¶
DOC2Text converts a standard io.Reader from a Microsoft Word .doc binary file and returns a reader (actually a bytes.Buffer) which will output the plain text found in the .doc file
func DOCX2Text ¶
DOCX2Text extracts the text of a Word document. Size is the full size of the input file.
func DecompressFile ¶
DecompressFile decompresses data. It supports: GZ, BZ, BZ2, XZ
func HTML2TextAndLinks ¶
func HTML2TextAndLinks(reader io.Reader, baseURL string) (pageText string, links []string, err error)
HTML2TextAndLinks extracts the text from an HTML document, as well as all links from its <a> and <img> tags. If the base URL is provided, relative links will be converted to absolute ones.
func InitPDFLicense ¶
func InitPDFLicense(key, name string)
InitPDFLicense initializes the PDF license
func IsExcessiveLargePicture ¶
IsExcessiveLargePicture checks if the picture has reasonable width and height, preventing a potential DoS when decoding it. This protects against the following problem: if the image claims to be large (in terms of width & height), jpeg.Decode may use a lot of memory; see https://github.com/golang/go/issues/10532.
func IsFileDOC ¶
IsFileDOC checks if the data indicates a DOC file. DOC has multiple signatures according to https://filesignatures.net/index.php?search=doc&mode=EXT, including D0 CF 11 E0 A1 B1 1A E1.
func IsFileDOCX ¶
IsFileDOCX checks if the data indicates a DOCX file. DOCX has a signature of 50 4B 03 04.
func IsFileMOBI ¶
IsFileMOBI checks if the data indicates a MOBI file
func IsFilePPT ¶
IsFilePPT checks if the data indicates a PPT file. PPT has multiple signatures according to https://www.filesignatures.net/index.php?page=search&search=PPT&mode=EXT, including D0 CF 11 E0 A1 B1 1A E1. This overlaps with others (including DOC and XLS).
func IsFilePPTX ¶
IsFilePPTX checks if the data indicates a PPTX file. PPTX has a signature of 50 4B 03 04. Warning: This collides with ZIP, DOCX and other zip-based files.
func IsFileRTF ¶
IsFileRTF checks if the data indicates an RTF file. RTF has a signature of 7B 5C 72 74 66 31, or as a string "{\rtf1".
func IsFileXLSX ¶
IsFileXLSX checks if the data indicates an XLSX file. XLSX has a signature of 50 4B 03 04. Warning: This collides with ZIP, DOCX and other zip-based files.
func IsFileZIP ¶
IsFileZIP checks if the data indicates a ZIP file. Many file formats like DOCX, XLSX, PPTX and APK are actually ZIP files. ZIP has a signature of 50 4B 03 04.
func Mobi2Text ¶
func Mobi2Text(file io.ReadSeeker) (string, error)
Mobi2Text converts a MOBI ebook to text
func ODS2Cells ¶
ODS2Cells converts an ODS file to individual cells. Size is the full size of the input file.
func ODS2Text ¶
func ODS2Text(file io.ReaderAt, size int64, writer io.Writer, limit int64) (written int64, err error)
ODS2Text extracts the text of an OpenDocument Spreadsheet. Size is the full size of the input file.
func ODT2Text ¶
func ODT2Text(file io.ReaderAt, size int64, writer io.Writer, limit int64) (written int64, err error)
ODT2Text extracts the text of an OpenDocument Text file. Size is the full size of the input file.
func PDFGetCreationDate ¶
func PDFGetCreationDate(f io.ReadSeeker) (date time.Time, valid bool)
PDFGetCreationDate tries to get the creation date
func PDFListContentStreams ¶
PDFListContentStreams writes all text streams in a PDF to the writer. It returns the number of characters it attempted to write (excluding "Page N" and new-lines) and an error, if any. The returned count can be used to determine whether any text was extracted. The parameter size is the maximum number of bytes (not characters) to write out.
func PPTX2Text ¶
PPTX2Text extracts the text of a PowerPoint document. Size is the full size of the input file.
func ResizeCompressPicture ¶
func ResizeCompressPicture(Picture []byte, Quality int, MaxWidth, MaxHeight uint) (compressed []byte, err error)
ResizeCompressPicture scales a picture down and compresses it. It accepts GIF, JPEG, PNG as input but output will always be JPEG. Quality specifies the output JPEG quality 0-100. Anything below 75 will noticeably reduce the picture quality. Warning: If the image claims to be large (in terms of width & height), this may use a lot of memory. Use IsExcessiveLargePicture first. Scaling a picture down is optional and only done if MaxWidth and MaxHeight are not 0. Even without rescaling, this function is useful to convert a picture into JPEG.
func XLSX2Cells ¶
XLSX2Cells converts an XLSX file to individual cells. Size is the full size of the input file. rowLimit defines how many rows per sheet to extract; -1 means unlimited. This exists as protection against some XLSX files that may use an excessive amount of memory.
func XLSX2Text ¶
func XLSX2Text(file io.ReaderAt, size int64, writer io.Writer, limit int64, rowLimit int) (written int64, err error)
XLSX2Text extracts the text of an Excel sheet. Size is the full size of the input file. Limit is the output limit in bytes. rowLimit defines how many rows per sheet to extract; -1 means unlimited. This exists as protection against some XLSX files that may use an excessive amount of memory.
Types ¶
type ImageResult ¶
ImageResult contains an extracted image
func PDFExtractImages ¶
func PDFExtractImages(input io.ReadSeeker) (images []ImageResult, err error)
PDFExtractImages extracts all images from a PDF file
type PPTXDocument ¶
type PPTXDocument struct {
Slides []PPTXSlide
}
PPTXDocument is a PPTX document loaded into memory
func (PPTXDocument) AsText ¶
func (doc PPTXDocument) AsText() (text string)
AsText returns the text on all slides
type SlideNumberSorter ¶
type SlideNumberSorter []PPTXSlide
SlideNumberSorter is used for sorting
func (SlideNumberSorter) Len ¶
func (a SlideNumberSorter) Len() int
func (SlideNumberSorter) Less ¶
func (a SlideNumberSorter) Less(i, j int) bool
func (SlideNumberSorter) Swap ¶
func (a SlideNumberSorter) Swap(i, j int)
type WordDocument ¶
type WordDocument struct {
Paragraphs []WordParagraph
}
WordDocument is a full word doc
func (WordDocument) AsText ¶
func (w WordDocument) AsText() string
AsText returns all text in the document
type WordParagraph ¶
WordParagraph is a single paragraph