warc

package
v0.0.0-...-4f67eda Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jul 23, 2024 License: MIT Imports: 10 Imported by: 0

Documentation

Index

Constants

View Source
const (
	// TypeInfo is a type of record that contains information about the file.
	TypeInfo string = "warcinfo"
	// TypeResponse is a type of record that is a complete schema-specific response, including network protocol information where possible.
	TypeResponse = "response"
	// TypeResource is a type of record that holds a resource, without full protocol response information, such as a file locally accessible.
	TypeResource = "resource"
	// TypeRequest is a type of record that holds the details of a complete schema-specific request, including network protocol information where possible.
	TypeRequest = "request"
	// TypeMetadata is a type of record that contains content created in order to further describe a harvested resource.
	TypeMetadata = "metadata"
	// TypeRevisit is a type of record that describes the revisitation of content already archived.
	TypeRevisit = "revisit"
	// TypeConversion is a type of record that describes an alternative version of another record's content that was created during the archival process.
	TypeConversion = "conversion"
	// TypeContinuation is a type of record appended to corresponding prior record block(s) to create a logically complete full-sized original record.
	TypeContinuation = "continuation"
)
View Source
// Reasons a record may have been truncated. NOTE(review): presumably these are
// the values carried by the Header.Truncated (WARC-Truncated) field — confirm.
const (
	// TruncatedReasonLength is the reason for a record to be truncated due to a record exceeding a configured max length.
	TruncatedReasonLength string = "length"
	// TruncatedReasonTime is the reason for a record to be truncated due to a process exceeding a configured max time.
	TruncatedReasonTime = "time"
	// TruncatedReasonDisconnect is the reason for a record to be truncated due to a network disconnect.
	TruncatedReasonDisconnect = "disconnect"
	// TruncatedReasonUnspecified is the reason for a record to be truncated due to other or unknown issues.
	TruncatedReasonUnspecified = "unspecified"
)
View Source
const AnyRecordType = "*"

AnyRecordType denotes a catch-all record type.

Variables

This section is empty.

Functions

func CreateID

func CreateID() (string, error)

CreateID creates a valid WARC Record ID using a UUID URN.

func Marshal

func Marshal(v interface{}) ([]byte, error)

Marshal converts an interface to a `key: value` format.

func ParseID

func ParseID(id string) (uuid.UUID, error)

ParseID parses a WARC Record ID and returns a UUID.

func Unmarshal

func Unmarshal(data []byte, v interface{}) error

Unmarshal sets values in a structure from a `key: value` format.

func UnmarshalStream

func UnmarshalStream(reader *bufio.Reader, v interface{}) error

UnmarshalStream sets values in a structure from a `key: value` format read from a buffered reader.

Types

type File

type File struct {
	// Records are all the records contained within the file.
	Records []*Record
}

File is a WARC file containing one or more records.

func (*File) String

func (file *File) String() string

String converts the file into a string

func (*File) Write

func (file *File) Write(writer io.Writer, compress bool)

Write writes the file to a stream.

func (*File) WriteHeaders

func (file *File) WriteHeaders(writer io.Writer, compress bool)

WriteHeaders writes only the headers to a stream (with all payloads being empty).

type Header struct {
	// Type (WARC-Type) is the type of the record. Mandatory.
	Type string `warc:"WARC-Type"`
	// RecordID (WARC-Record-ID) is a globally unique identifier. Must be a valid URI. Mandatory.
	RecordID string `warc:"WARC-Record-ID"`
	// Date (WARC-Date) is the time at which the data capture for the record began. Mandatory.
	Date time.Time `warc:"WARC-Date"`
	// ContentLength (Content-Length) is the number of octets in the block. If no block is present 0 is used. Mandatory.
	ContentLength uint64 `warc:"Content-Length"`
	// ContentType (Content-Type) is the RFC2045 MIME type of the information in the record's block. Mandatory for non-empty, non-continuation records.
	ContentType string `warc:"Content-Type,omitempty"`
	// ConcurrentTo (WARC-Concurrent-To) is the Record IDs of any records created as part of the same capture as the current record. Must be a valid URI.
	ConcurrentTo string `warc:"WARC-Concurrent-To,omitempty"`
	// BlockDigest (WARC-Block-Digest) is a digest of the full block of the record. Format algorithm:digest.
	BlockDigest string `warc:"WARC-Block-Digest,omitempty"`
	// PayloadDigest (WARC-Payload-Digest) is a digest of the payload of the record. Format algorithm:digest.
	PayloadDigest string `warc:"WARC-Payload-Digest,omitempty"`
	// IPAddress (WARC-IP-Address) is the IPv4 or IPv6 address of the server giving the response.
	IPAddress string `warc:"WARC-IP-Address,omitempty"`
	// RefersTo (WARC-Refers-To) is the Record ID of a single record for which the present record holds additional content.
	RefersTo string `warc:"WARC-Refers-To,omitempty"`
	// TargetURI (WARC-Target-URI) is the original URI whose capture gave rise to the information content in this record.
	TargetURI string `warc:"WARC-Target-URI,omitempty"`
	// Truncated (WARC-Truncated) is the reason a record payload was truncated.
	Truncated string `warc:"WARC-Truncated,omitempty"`
	// InfoID (WARC-Warcinfo-ID) indicates the Record ID of the associated "warcinfo" record for this record.
	InfoID string `warc:"WARC-Warcinfo-ID,omitempty"`
	// Filename (WARC-Filename) is the filename containing the current "warcinfo" record.
	Filename string `warc:"WARC-Filename,omitempty"`
	// Profile (WARC-Profile) is a URI signifying the kind of analysis and handling applied in a "revisit" record.
	Profile string `warc:"WARC-Profile,omitempty"`
	// IdentifiedPayloadType (WARC-Identified-Payload-Type) is the content-type of the record's payload as determined by an independent check.
	IdentifiedPayloadType string `warc:"WARC-Identified-Payload-Type,omitempty"`
	// SegmentNumber (WARC-Segment-Number) reports the current record's relative ordering in a sequence of segmented records. Mandatory for "continuation" records.
	SegmentNumber uint64 `warc:"WARC-Segment-Number"`
	// SegmentOriginID (WARC-Segment-Origin-ID) identifies the starting record in a series of segmented records.
	SegmentOriginID string `warc:"WARC-Segment-Origin-ID,omitempty"`
	// SegmentTotalLength (WARC-Segment-Total-Length) reports the total length of all segment content blocks when concatenated together.
	SegmentTotalLength uint64 `warc:"WARC-Segment-Total-Length"`
	// contains filtered or unexported fields
}

Header is a WARC header containing information about the payload.

func (*Header) String

func (header *Header) String() (string, error)

String converts the header into a string

func (*Header) Write

func (header *Header) Write(writer io.Writer) error

Write writes the header to a stream.

type IPayload

type IPayload interface {
	// Write writes the payload to a stream.
	Write(writer io.Writer) error
	// Bytes returns the byte representation of the payload.
	Bytes() ([]byte, error)
	// String converts the payload into a string.
	String() (string, error)
	// Reader returns a reader for the data.
	Reader() io.Reader
}

IPayload is any payload of a WARC Record.

func ParseInfoPayload

func ParseInfoPayload(payload IPayload, header *Header) (IPayload, error)

ParseInfoPayload parses a WARC info record's payload.

func ParseMetadataPayload

func ParseMetadataPayload(payload IPayload, header *Header) (IPayload, error)

ParseMetadataPayload parses a WARC metadata record's payload.

func RawPayloadParser

func RawPayloadParser(reader *bufio.Reader, header *Header) (IPayload, error)

RawPayloadParser is a payload parser for reading raw payloads.

type InfoPayload

type InfoPayload struct {
	// RawPayload is the embedded raw-bytes payload.
	RawPayload
	// Operator contains contact information for the operator who created this WARC resource.
	Operator string `warc:"operator,omitempty"`
	// Software is the software and version used to create this WARC resource.
	Software string `warc:"software,omitempty"`
	// Robots is the robots policy followed by the harvester creating this WARC resource.
	Robots string `warc:"robots,omitempty"`
	// Hostname is the hostname of the machine that created this WARC resource.
	Hostname string `warc:"hostname,omitempty"`
	// IP is the IP address of the machine that created this WARC resource.
	IP string `warc:"ip,omitempty"`
	// UserAgent is the HTTP 'user-agent' header usually sent by the harvester along with each request.
	UserAgent string `warc:"http-header-user-agent,omitempty"`
	// From is the HTTP 'From' header usually sent by the harvester along with each request.
	From string `warc:"http-header-from,omitempty"`
}

InfoPayload is the payload of a "warcinfo" record.

type MetadataPayload

type MetadataPayload struct {
	// RawPayload is the embedded raw-bytes payload.
	RawPayload
	// Via is the referring URI from which the archived URI was discovered.
	Via string `warc:"via,omitempty"`
	// HopsFromSeed describes the type of each hop from a starting URI to the current URI.
	HopsFromSeed string `warc:"hopsFromSeed,omitempty"`
	// FetchTime is the time that it took to collect the archived URI, starting from the initiation of network traffic.
	// NOTE(review): the "fetchTimeMs" key suggests the serialized value is in milliseconds — confirm against the marshaller.
	FetchTime time.Duration `warc:"fetchTimeMs"`
}

MetadataPayload is a payload record that contains content created in order to further describe a harvested resource.

type PayloadParser

type PayloadParser func(reader *bufio.Reader, header *Header) (IPayload, error)

PayloadParser is a parser invoked to unmarshal a payload. TODO: give the payload parser a "scoped" reader, with bounds only for the payload?

type RawPayload

type RawPayload struct {
	// Data is the raw data of the payload.
	Data []byte
	// Length is the length of the data in bytes.
	Length uint64
}

RawPayload is a base IPayload implementation for raw bytes.

func (*RawPayload) Bytes

func (payload *RawPayload) Bytes() ([]byte, error)

Bytes returns the byte representation of the payload.

func (*RawPayload) Reader

func (payload *RawPayload) Reader() io.Reader

Reader returns a reader for the data.

func (*RawPayload) String

func (payload *RawPayload) String() (string, error)

String converts the payload into a string.

func (*RawPayload) Write

func (payload *RawPayload) Write(writer io.Writer) error

Write writes the payload to a stream.

type Reader

type Reader struct {
	// Seekable presumably reports whether the underlying stream supports
	// seeking — TODO confirm against the implementation.
	Seekable bool
	// contains filtered or unexported fields
}

Reader is a WARC reader.

func NewReader

func NewReader(reader io.ReadSeeker, compressed bool) (*Reader, error)

NewReader creates a new reader. Compressed specifies whether or not the stream is compressed using gzip.

func (*Reader) ReadAll

func (reader *Reader) ReadAll() (*File, error)

ReadAll reads all records, including their payloads.

func (*Reader) ReadAllHeaders

func (reader *Reader) ReadAllHeaders() (*File, error)

ReadAllHeaders reads all records' headers, excluding their payloads.

func (*Reader) ReadHeader

func (reader *Reader) ReadHeader() (*Header, error)

ReadHeader reads a header. Returns nil if none was read (EOF).

func (*Reader) ReadPayload

func (reader *Reader) ReadPayload(header *Header) (IPayload, error)

ReadPayload reads a payload. Returns nil if none was read (EOF). May be invoked at any time as the function seeks to the correct place in the archive to read the payload.

func (*Reader) ReadRecord

func (reader *Reader) ReadRecord() (*Record, error)

ReadRecord reads the next record.

func (*Reader) ReadRecordHeader

func (reader *Reader) ReadRecordHeader() (*Record, error)

ReadRecordHeader reads the next record, skipping the payload.

func (*Reader) RegisterPayloadParser

func (reader *Reader) RegisterPayloadParser(recordType string, parser PayloadParser)

RegisterPayloadParser registers a parser to be invoked after reading a header for the specific recordType. An asterisk ('*', see AnyRecordType) may be used to register a catch-all parser for any record type. By default this parser will parse payloads as RawPayload.

type Record

type Record struct {
	// Header is a WARC header containing information about the payload. Guaranteed to exist.
	Header *Header
	// Payload is the body of the record. May not exist.
	Payload IPayload
}

Record is a WARC record.

func (*Record) String

func (record *Record) String() (string, error)

String converts the record into a string

func (*Record) Write

func (record *Record) Write(writer io.Writer) error

Write writes the record to a stream.

func (*Record) WriteHeader

func (record *Record) WriteHeader(writer io.Writer) error

WriteHeader writes the record's header and an empty payload to a stream.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL