pdf_parser

package module
v0.1.97 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 28, 2024 License: MIT Imports: 11 Imported by: 0

README

Pdf metadata parser

A Go library for parsing PDF metadata.

License

MIT

Usage
import "github.com/gekkowrld/pdf_parser"

// parse file
pdf, err := pdf_parser.ParsePdf("filepath/file.pdf")

// main functions
pdf.GetTitle()
pdf.GetAuthor()
pdf.GetCreator()
pdf.GetISBN()
pdf.GetPublishers() // returns []string
pdf.GetLanguages()  // returns []string
pdf.GetDescription()
pdf.GetPagesCount()

Using a custom github.com/sirupsen/logrus logger

import (
	"os"
	"path/filepath"

	"github.com/gekkowrld/pdf_parser"
	logger "github.com/sirupsen/logrus"
)

lg := logger.New()
lg.SetOutput(os.Stdout)
lg.SetFormatter(&logger.JSONFormatter{})

pdf_parser.SetLogger(lg)
file, _ := filepath.Abs("filepath/file.pdf")
pdf, err := pdf_parser.ParsePdf(file)

Documentation

Index

Constants

View Source
const BufferSize = 50
View Source
const BufferSize300 = 300

Variables

This section is empty.

Functions

func SetLogger

func SetLogger(logrusLogger *logger.Logger)

SetLogger sets the logrus logger used by the PDF parser.

Types

type InfoObject

type InfoObject struct {
	Title        string
	Author       string
	Creator      string
	CreationDate string
	Producer     string
	ModDate      string
}

type MetaDataRdf

type MetaDataRdf struct {
	Title       string
	Description string
	Creator     string
	Date        string
	Isbn        string

	Publishers []string
	Languages  []string
}

type Metadata

type Metadata struct {
	Type          string
	Subtype       string
	Length        int64
	DL            int64
	RawStreamData []byte
	RdfMeta       *MetaDataRdf
}

type ObjectIdentifier

type ObjectIdentifier struct {
	ObjectNumber     int
	GenerationNumber int
	KeyWord          string
}

type ObjectSubsection

type ObjectSubsection struct {
	Id                      int // objectId
	ObjectsCount            int
	FirstSubsectionObjectId int
	LastSubsectionObjectId  int
	Elements                map[int]*ObjectSubsectionElement
}

ObjectSubsection is an object subsection that contains the list of objects for this object.

type ObjectSubsectionElement

type ObjectSubsectionElement struct {
	Id               int
	ObjectNumber     int
	GenerationNumber int
	KeyWord          string
}

type PdfInfo

type PdfInfo struct {
	PdfVersion               string
	OriginalXrefOffset       int64
	OriginalTrailerSection   TrailerSection
	AdditionalTrailerSection []*TrailerSection
	XrefTable                []*XrefTable
	Root                     RootObject
	Info                     InfoObject
	Metadata                 Metadata
	PagesCount               int
}

func ParsePdf

func ParsePdf(fileName string) (*PdfInfo, error)

ParsePdf parses the metadata of a PDF file.

func (*PdfInfo) GetAuthor

func (pdf *PdfInfo) GetAuthor() string

func (*PdfInfo) GetCover

func (pdf *PdfInfo) GetCover(filepath string) bool

func (*PdfInfo) GetCreator

func (pdf *PdfInfo) GetCreator() string

func (*PdfInfo) GetDate

func (pdf *PdfInfo) GetDate() string

func (*PdfInfo) GetDescription

func (pdf *PdfInfo) GetDescription() string

func (*PdfInfo) GetISBN

func (pdf *PdfInfo) GetISBN() string

func (*PdfInfo) GetLanguage

func (pdf *PdfInfo) GetLanguage() string

func (*PdfInfo) GetLanguages

func (pdf *PdfInfo) GetLanguages() []string

func (*PdfInfo) GetPagesCount

func (pdf *PdfInfo) GetPagesCount() int

func (*PdfInfo) GetPublisherInfo

func (pdf *PdfInfo) GetPublisherInfo() string

func (*PdfInfo) GetPublishers

func (pdf *PdfInfo) GetPublishers() []string

func (*PdfInfo) GetTitle

func (pdf *PdfInfo) GetTitle() string

type RootObject

type RootObject struct {
	Type       string
	Pages      *ObjectIdentifier
	Metadata   *ObjectIdentifier
	PageLabels *ObjectIdentifier
	Lang       string
}

type TrailerSection

type TrailerSection struct {
	IdRaw string
	Info  ObjectIdentifier
	Root  ObjectIdentifier
	Size  string
	Prev  int64
}

type XrefTable

type XrefTable struct {
	Objects           map[int]*ObjectSubsectionElement
	ObjectSubsections map[int]*ObjectSubsection
	SectionStart      int64
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL