Documentation ¶
Index ¶
- Variables
- type WARCFile
- type WARCHeader
- type WARCReader
- type WARCRecord
- func (wr *WARCRecord) Get(name string) (string, bool)
- func (wr *WARCRecord) GetChecksum() string
- func (wr *WARCRecord) GetDate() string
- func (wr *WARCRecord) GetHeader() *WARCHeader
- func (wr *WARCRecord) GetIpAddress() string
- func (wr *WARCRecord) GetPayload() *utils.FilePart
- func (wr *WARCRecord) GetType() string
- func (wr *WARCRecord) GetUrl() string
- func (wr *WARCRecord) Offset() int
- func (wr *WARCRecord) Set(name string, value string)
Constants ¶
This section is empty.
Variables ¶
var CONTENT_TYPES map[string]string = map[string]string{
"warcinfo": "application/warc-fields",
"response": "application/http; msgtype=response",
"request": "application/http; msgtype=request",
"metadata": "application/warc-fields",
}
var KNOWN_HEADERS map[string]string = map[string]string{
"type": "WARC-Type",
"date": "WARC-Date",
"record_id": "WARC-Record-ID",
"ip_address": "WARC-IP-Address",
"target_uri": "WARC-Target-URI",
"warcinfo_id": "WARC-Warcinfo-ID",
"request_uri": "WARC-Request-URI",
"content_type": "Content-Type",
"content_length": "Content-Length",
}
var RE_HEADER *regexp.Regexp = regexp.MustCompile("([a-zA-Z_\\-]+): *(.*)\r\n")
var RE_VERSION *regexp.Regexp = regexp.MustCompile("WARC/(\\d+.\\d+)\r\n")
Functions ¶
This section is empty.
Types ¶
type WARCFile ¶
type WARCFile struct {
// contains filtered or unexported fields
}
func NewWARCFile ¶
func NewWARCFile(reader io.ReadCloser) (*WARCFile, error)
Creates a new WARCFile. The input should be a handle to a gzipped WARC file.
func (*WARCFile) GetReader ¶
func (wf *WARCFile) GetReader() *WARCReader
func (*WARCFile) ReadRecord ¶
func (wf *WARCFile) ReadRecord() (*WARCRecord, error)
type WARCHeader ¶
type WARCHeader struct {
*utils.CIStringMap
// contains filtered or unexported fields
}
The WARCHeader object represents the headers of a WARC record. It provides a dictionary-like interface for accessing the headers.
The following mandatory fields are also accessible as get/set methods.
- h.GetRecordId() == h.Get("WARC-Record-ID")
- h.GetContentLength() == h.Get("Content-Length") // converted to int
- h.GetDate() == h.Get("WARC-Date")
- h.GetType() == h.Get("WARC-Type")
:params headers: map[string]string of headers. :params defaults: If true, important headers like WARC-Record-ID,
WARC-Date, Content-Type and Content-Length are initialized automatically if not already present. TODO: add this param back for read/write
func NewWARCHeader ¶
func NewWARCHeader(headers map[string]string) *WARCHeader
TODO: restore 'defaults' arg for read/write
func (*WARCHeader) GetContentLength ¶
func (wh *WARCHeader) GetContentLength() int
The Content-Length header as int.
func (*WARCHeader) GetRecordId ¶
func (wh *WARCHeader) GetRecordId() string
The value of WARC-Record-ID header.
func (*WARCHeader) String ¶
func (wh *WARCHeader) String() string
func (*WARCHeader) WriteTo ¶
func (wh *WARCHeader) WriteTo(f io.Writer)
Writes this header to a file, in the format specified by WARC.
type WARCReader ¶
type WARCReader struct {
// contains filtered or unexported fields
}
func NewWARCReader ¶
func NewWARCReader(filehandle io.Reader, gzipfile *gzip.Reader) *WARCReader
func (*WARCReader) Iterate ¶
func (wr *WARCReader) Iterate(callback func(*WARCRecord, error))
func (*WARCReader) ReadHeader ¶
func (wr *WARCReader) ReadHeader(reader *bufio.Reader) (*WARCHeader, error)
func (*WARCReader) ReadRecord ¶
func (wr *WARCReader) ReadRecord() (*WARCRecord, error)
type WARCRecord ¶
type WARCRecord struct {
// contains filtered or unexported fields
}
The WARCRecord object represents a WARC Record.
func NewWARCRecord ¶
func NewWARCRecord(header *WARCHeader, payload *utils.FilePart, headers map[string]string) *WARCRecord
Creates a new WARC record.
func (*WARCRecord) GetChecksum ¶
func (wr *WARCRecord) GetChecksum() string
func (*WARCRecord) GetHeader ¶
func (wr *WARCRecord) GetHeader() *WARCHeader
func (*WARCRecord) GetIpAddress ¶
func (wr *WARCRecord) GetIpAddress() string
The IP address of the host contacted to retrieve the content of this record. This value is available from the WARC-IP-Address header.
func (*WARCRecord) GetPayload ¶
func (wr *WARCRecord) GetPayload() *utils.FilePart
func (*WARCRecord) GetUrl ¶
func (wr *WARCRecord) GetUrl() string
The value of the WARC-Target-URI header if the record is of type "response".
func (*WARCRecord) Offset ¶
func (wr *WARCRecord) Offset() int
Offset of this record in the WARC file from which this record is read. TODO: not yet implemented. Currently hard-coded to -1.
func (*WARCRecord) Set ¶
func (wr *WARCRecord) Set(name string, value string)