Documentation ¶
Index ¶
- Constants
- func CreateID() (string, error)
- func Marshal(v interface{}) ([]byte, error)
- func ParseID(id string) (uuid.UUID, error)
- func Unmarshal(data []byte, v interface{}) error
- func UnmarshalStream(reader *bufio.Reader, v interface{}) error
- type File
- type Header
- type IPayload
- type InfoPayload
- type MetadataPayload
- type PayloadParser
- type RawPayload
- type Reader
- func (reader *Reader) ReadAll() (*File, error)
- func (reader *Reader) ReadAllHeaders() (*File, error)
- func (reader *Reader) ReadHeader() (*Header, error)
- func (reader *Reader) ReadPayload(header *Header) (IPayload, error)
- func (reader *Reader) ReadRecord() (*Record, error)
- func (reader *Reader) ReadRecordHeader() (*Record, error)
- func (reader *Reader) RegisterPayloadParser(recordType string, parser PayloadParser)
- type Record
Constants ¶
const ( // TypeInfo is a type of record that contains information about the file. TypeInfo string = "warcinfo" // TypeResponse is a type of record that is a complete schema-specific response, including network protocol information where possible. TypeResponse = "response" // TypeResource is a type of record that holds a resource, without full protocol response information, such as a file locally accessible. TypeResource = "resource" // TypeRequest is a type of record that holds the details of a complete schema-specific request, including network protocol information where possible. TypeRequest = "request" // TypeMetadata is a type of record that contains content created in order to further describe a harvested resource. TypeMetadata = "metadata" // TypeRevisit is a type of record that describes the revisitation of content already archived. TypeRevisit = "revisit" // TypeConversion is a type of record that describes an alternative version of another record's content that was created during the archival process. TypeConversion = "conversion" // TypeContinuation is a type of record appended to corresponding prior record block(s) to create a logically complete full-sized original record. TypeContinuation = "continuation" )
const ( // TruncatedReasonLength is the reason for a record to be truncated due to a record exceeding a configured max length. TruncatedReasonLength string = "length" // TruncatedReasonTime is the reason for a record to be truncated due to a process exceeding a configured max time. TruncatedReasonTime = "time" // TruncatedReasonDisconnect is the reason for a record to be truncated due to a network disconnect. TruncatedReasonDisconnect = "disconnect" // TruncatedReasonUnspecified is the reason for a record to be truncated due to other or unknown issues. TruncatedReasonUnspecified = "unspecified" )
const AnyRecordType = "*"
AnyRecordType denotes a catch-all record type.
Variables ¶
This section is empty.
Functions ¶
func UnmarshalStream ¶
UnmarshalStream sets values in a structure from a `key: value` format.
Types ¶
type File ¶
type File struct { // Records are all the records contained within the file. Records []*Record }
File is a WARC file containing one or more records.
type Header ¶
type Header struct { // Type (WARC-Type). Mandatory. Type string `warc:"WARC-Type"` // RecordID (WARC-Record-ID) is a globally unique identifier. Must be a valid URI. Mandatory. RecordID string `warc:"WARC-Record-ID"` // Date (WARC-Date) is the time at which the data capture for the record began. Mandatory. Date time.Time `warc:"WARC-Date"` // ContentLength is the number of octets in the block. If no block is present 0 is used. Mandatory. ContentLength uint64 `warc:"Content-Length"` // ContentType is the RFC2045 MIME type of the information in the record's block. Mandatory for non-empty, non-continuation records. ContentType string `warc:"Content-Type,omitempty"` // ConcurrentTo (WARC-Concurrent-To) is the Record IDs of any records created as part of the same capture as the current record. Must be a valid URI. ConcurrentTo string `warc:"WARC-Concurrent-To,omitempty"` // BlockDigest (WARC-Block-Digest) is a digest of the full block of the record. Format algorithm:digest. BlockDigest string `warc:"WARC-Block-Digest,omitempty"` // PayloadDigest (WARC-Payload-Digest) is a digest of the payload of the record. Format algorithm:digest. PayloadDigest string `warc:"WARC-Payload-Digest,omitempty"` // IPAddress (WARC-IP-Address) is the IPv4 or IPv6 address of the server giving the response. IPAddress string `warc:"WARC-IP-Address,omitempty"` // RefersTo (WARC-Refers-To) is the Record ID of a single record for which the present record holds additional content. RefersTo string `warc:"WARC-Refers-To,omitempty"` // TargetURI (WARC-Target-URI) is the original URI whose capture gave rise to the information content in this record. TargetURI string `warc:"WARC-Target-URI,omitempty"` // Truncated (WARC-Truncated) is the reason a record payload was truncated. Truncated string `warc:"WARC-Truncated,omitempty"` // InfoID (WARC-Warcinfo-ID) indicates the Record ID of the associated "warcinfo" record for this record. 
InfoID string `warc:"WARC-Warcinfo-ID,omitempty"` // Filename (WARC-Filename) is the filename containing the current "warcinfo" record. Filename string `warc:"WARC-Filename,omitempty"` // Profile (WARC-Profile) is a URI signifying the kind of analysis and handling applied in a "revisit" record. Profile string `warc:"WARC-Profile,omitempty"` // IdentifiedPayloadType (WARC-Identified-Payload-Type) is the content-type of the record's payload as determined by an independent check. IdentifiedPayloadType string `warc:"WARC-Identified-Payload-Type,omitempty"` // SegmentNumber (WARC-Segment-Number) reports the current record's relative ordering in a sequence of segmented records. Mandatory for "continuation" records. SegmentNumber uint64 `warc:"WARC-Segment-Number"` // SegmentOriginID (WARC-Segment-Origin-ID) identifies the starting record in a series of segmented records. SegmentOriginID string `warc:"WARC-Segment-Origin-ID,omitempty"` // SegmentTotalLength (WARC-Segment-Total-Length) reports the total length of all segment content blocks when concatenated together. SegmentTotalLength uint64 `warc:"WARC-Segment-Total-Length"` // contains filtered or unexported fields }
Header is a WARC header containing information about the payload.
type IPayload ¶
type IPayload interface { // Write writes the payload to a stream. Write(writer io.Writer) error // Bytes returns the byte representation of the payload. Bytes() ([]byte, error) // String converts the payload into a string. String() (string, error) // Reader returns a reader for the data. Reader() io.Reader }
IPayload is any payload of a WARC Record.
func ParseInfoPayload ¶
ParseInfoPayload parses a WARC info record's payload.
func ParseMetadataPayload ¶
ParseMetadataPayload parses a WARC metadata record's payload.
type InfoPayload ¶
type InfoPayload struct { RawPayload // Operator contains contact information for the operator who created this WARC resource. Operator string `warc:"operator,omitempty"` // Software is the software and version used to create this WARC resource. Software string `warc:"software,omitempty"` // Robots is the robots policy followed by the harvester creating this WARC resource. Robots string `warc:"robots,omitempty"` // Hostname is the hostname of the machine that created this WARC resource. Hostname string `warc:"hostname,omitempty"` // IP is the IP address of the machine that created this WARC resource. IP string `warc:"ip,omitempty"` // UserAgent is the HTTP 'user-agent' header usually sent by the harvester along with each request. UserAgent string `warc:"http-header-user-agent,omitempty"` // From is the HTTP 'From' header usually sent by the harvester along with each request. From string `warc:"http-header-from,omitempty"` }
InfoPayload is the payload of a "warcinfo" record.
type MetadataPayload ¶
type MetadataPayload struct { RawPayload // Via is the referring URI from which the archived URI was discovered. Via string `warc:"via,omitempty"` // HopsFromSeed describes the type of each hop from a starting URI to the current URI. HopsFromSeed string `warc:"hopsFromSeed,omitempty"` // FetchTime is the time that it took to collect the archived URI, starting from the initiation of network traffic. FetchTime time.Duration `warc:"fetchTimeMs"` }
MetadataPayload is a payload record that contains content created in order to further describe a harvested resource.
type PayloadParser ¶
PayloadParser is a parser invoked to unmarshal a payload. TODO: give the payload parser a "scoped" reader, with bounds only for the payload?
type RawPayload ¶
type RawPayload struct { // Data is the raw data of the payload Data []byte // Length is the length of the data in bytes Length uint64 }
RawPayload is a base IPayload implementation for raw bytes.
func (*RawPayload) Bytes ¶
func (payload *RawPayload) Bytes() ([]byte, error)
Bytes returns the byte representation of the payload.
func (*RawPayload) Reader ¶
func (payload *RawPayload) Reader() io.Reader
Reader returns a reader for the data.
func (*RawPayload) String ¶
func (payload *RawPayload) String() (string, error)
String converts the payload into a string.
type Reader ¶
type Reader struct { Seekable bool // contains filtered or unexported fields }
Reader is a WARC reader.
func NewReader ¶
func NewReader(reader io.ReadSeeker, compressed bool) (*Reader, error)
NewReader creates a new reader. Compressed specifies whether or not the stream is compressed using gzip.
func (*Reader) ReadAllHeaders ¶
ReadAllHeaders reads all records' headers, excluding their payloads.
func (*Reader) ReadHeader ¶
ReadHeader reads a header. Returns nil if none was read (EOF).
func (*Reader) ReadPayload ¶
ReadPayload reads a payload. Returns nil if none was read (EOF). May be invoked at any time as the function seeks to the correct place in the archive to read the payload.
func (*Reader) ReadRecord ¶
ReadRecord reads the next record.
func (*Reader) ReadRecordHeader ¶
ReadRecordHeader reads the next record, skipping the payload.
func (*Reader) RegisterPayloadParser ¶
func (reader *Reader) RegisterPayloadParser(recordType string, parser PayloadParser)
RegisterPayloadParser registers a parser to be invoked after reading a header for the specific recordType. An asterisk ('*') may be used to register a catch-all parser for any record type. By default the catch-all parser parses payloads as RawPayload.
type Record ¶
type Record struct { // Header is a WARC header containing information about the payload. Guaranteed to exist. Header *Header // Payload is the body of the record. May not exist. Payload IPayload }
Record is a WARC record.