Documentation ¶
Index ¶
- Constants
- Variables
- func DateTimeToUTC(date string) time.Time
- func DownloadFile(url, filepath string) error
- func ParseDateFromFilePath(path string) time.Time
- func ParseMBoxMsg(Debug int, groupName string, msg []byte) (item map[string]interface{}, valid, warn bool)
- func TrimDots(s string) string
- func TrimFirstDash(s string) (year, month string)
- func TrimFirstDot(s string) string
- type AffiliationClient
- type ESClientProvider
- type EnrichedMessage
- type Enricher
- func (e *Enricher) EnrichMessage(rawMessage *RawMessage, now time.Time) (*EnrichedMessage, error)
- func (e *Enricher) FormatTimestampString(str string) (*time.Time, error)
- func (e *Enricher) GetEmailDomain(email string) string
- func (e *Enricher) GetEmailUsername(email string) string
- func (e *Enricher) GetUserName(rawMailString string) (username string)
- func (e *Enricher) HandleMapping(index string) error
- func (e *Enricher) HandleObfuscatedEmail(rawMailString string) (email string)
- func (e *Enricher) IsValidEmail(rawMailString string) bool
- func (e *Enricher) RemoveSpecialCharactersFromString(s string) (val *string)
- type Fetcher
- func (f *Fetcher) AddMetadata(msg interface{}, endpoint, slug, groupName string) *RawMessage
- func (f *Fetcher) ElasticRawMapping() []byte
- func (f *Fetcher) ElasticRichMapping() []byte
- func (f *Fetcher) Fetch(url string, fromDate *time.Time) (map[string]string, error)
- func (f *Fetcher) FetchItem(slug, groupName, endpoint string, fromDate time.Time, limit int, now time.Time) ([]*RawMessage, error)
- func (f *Fetcher) Find(slice []string, val string) (bool, string)
- func (f *Fetcher) GetLastDate(ESIndex string, now time.Time) (time.Time, error)
- func (f *Fetcher) HandleMapping(index string) error
- func (f *Fetcher) ItemCategory(item interface{}) string
- func (f *Fetcher) ItemID(item interface{}) string
- func (f *Fetcher) ItemUpdatedOn(item interface{}) time.Time
- func (f *Fetcher) ParseArchiveLinks(archivesURL string, fromDate *time.Time) ([]string, error)
- func (f *Fetcher) Query(index string, query map[string]interface{}) (*RawHits, error)
- type HTTPClientProvider
- type HitSource
- type Hits
- type Manager
- type MessageSearchFields
- type NHits
- type NestedHits
- type NestedRawHits
- type Params
- type RawHits
- type RawMessage
- type RawMessageData
- type TopHits
Constants ¶
const ( // ModMboxThreadStr ... ModMboxThreadStr = "/thread" // Pipermail datasource Pipermail = "pipermail" // PiperBackendVersion ... PiperBackendVersion = "0.0.1" // MessageDateField ... MessageDateField = "date" // Message ... Message = "message" // MessageIDField ... MessageIDField = "Message-ID" // Unknown ... Unknown = "Unknown" // MaxConcurrentRequests ... MaxConcurrentRequests = 100000 )
const ( // MessageReceivedField ... MessageReceivedField = "received" // MaxMessageBodyLength ... MaxMessageBodyLength = 1000 // DropXFields - drop fields starting with X- - to avoid ES 1000 fields limit DropXFields = true // MaxMessageProperties maximum properties that can be set on the message object MaxMessageProperties = 255 // ContentType - common constant string ContentType = "Content-Type" // LowerContentType - common constant string LowerContentType = "content-type" )
Variables ¶
var ( // CompressedTypes ... CompressedTypes = []string{".gz", ".bz2", ".zip", ".tar", ".tar.gz", ".tar.bz2", ".tgz", ".tbz"} // AcceptedTypes ... AcceptedTypes = []string{".mbox", ".txt"} // CombinedTypes ... CombinedTypes []string // MONTHS ... MONTHS = map[string]int{"January": 1, "February": 2, "March": 3, "April": 4, "May": 5, "June": 6, "July": 7, "August": 8, "September": 9, "October": 10, "November": 11, "December": 12} // DefaultDateTime ... DefaultDateTime = time.Date(1970, 1, 1, 0, 0, 0, 0, time.UTC) // MessageSeparator ... MessageSeparator = []byte("\nFrom") // PiperRawMapping ... PiperRawMapping = []byte(`{"mappings":{"dynamic":true,"properties":{"metadata__updated_on":{"type":"date"},"data":{"properties":{"body":{"dynamic":false,"properties":{}}}}}}}`) // PiperRichMapping ... PiperRichMapping = []byte(`{"mappings":{"dynamic_templates":[{"notanalyzed":{"match":"*","match_mapping_type":"string","mapping":{"type":"keyword"}}},{"int_to_float":{"match":"*","match_mapping_type":"long","mapping":{"type":"float"}}},{"formatdate":{"match":"*","match_mapping_type":"date","mapping":{"format":"strict_date_optional_time||epoch_millis","type":"date"}}}]}}`) // EmailObfuscationPatterns ... EmailObfuscationPatterns = []string{" at ", "_at_", " en "} // ArchiveDownloadsPath ... ArchiveDownloadsPath = strings.TrimSpace(os.Getenv("HOME") + "/.perceval/mailinglists/") // DumpsPath ... DumpsPath = strings.TrimSpace(os.Getenv("HOME") + "/.perceval/dumps/") )
var ( // LowerDayNames - lower case 3 letter US day names LowerDayNames = map[string]struct{}{ "mon": {}, "tue": {}, "wed": {}, "thu": {}, "fri": {}, "sat": {}, "sun": {}, } // LowerMonthNames - map lower month names LowerMonthNames = map[string]string{ "jan": "Jan", "feb": "Feb", "mar": "Mar", "apr": "Apr", "may": "May", "jun": "Jun", "jul": "Jul", "aug": "Aug", "sep": "Sep", "oct": "Oct", "nov": "Nov", "dec": "Dec", } // LowerFullMonthNames - map lower month names (full) LowerFullMonthNames = map[string]string{ "january": "Jan", "february": "Feb", "march": "Mar", "april": "Apr", "may": "May", "june": "Jun", "july": "Jul", "august": "Aug", "september": "Sep", "october": "Oct", "november": "Nov", "december": "Dec", } // SpacesRE - match 1 or more space characters SpacesRE = regexp.MustCompile(`\s+`) // TZOffsetRE - time zone offset that comes after +0... +1... -0... -1... // Can be 3 or 3 digits then whitespace and then anything TZOffsetRE = regexp.MustCompile(`^(\d{3})(\s+.*$|$)`) // MessageLineSeparator - message line separator MessageLineSeparator = []byte("\n") )
var ( // PipermailRawMapping - Pipeermail raw index mapping PipermailRawMapping = []byte(`{"mappings": {"dynamic":true,"properties":{"metadata__updated_on":{"type":"date"},"data":{"properties":{"description":{"type":"text","index":true},"full_description":{"type":"text","index":true}}}}}}`) )
Functions ¶
func DownloadFile ¶
DownloadFile will download a url to a local file. It's efficient because it will write as it downloads and not load the whole file into memory.
func ParseDateFromFilePath ¶
ParseDateFromFilePath ...
Types ¶
type AffiliationClient ¶
type AffiliationClient interface { GetIdentityByUser(key string, value string) (*affiliation.AffIdentity, error) AddIdentity(identity *affiliation.Identity) bool GetOrganizations(uuid string, projectSlug string) *[]affiliation.Enrollment }
AffiliationClient manages user identity
type ESClientProvider ¶
type ESClientProvider interface { Add(index string, documentID string, body []byte) ([]byte, error) CreateIndex(index string, body []byte) ([]byte, error) Bulk(body []byte) ([]byte, error) Get(index string, query map[string]interface{}, result interface{}) (err error) GetStat(index string, field string, aggType string, mustConditions []map[string]interface{}, mustNotConditions []map[string]interface{}) (result time.Time, err error) BulkInsert(data []elastic.BulkData) ([]byte, error) }
ESClientProvider is the interface used to communicate with the Elasticsearch server
type EnrichedMessage ¶
type EnrichedMessage struct { ID string `json:"id"` TZ float64 `json:"tz"` MessageID string `json:"Message-ID"` UUID string `json:"uuid"` AuthorName string `json:"author_name"` Root bool `json:"root"` AuthorOrgName string `json:"author_org_name"` AuthorBot bool `json:"author_bot"` BodyExtract string `json:"body_extract"` AuthorID string `json:"author_id"` SubjectAnalyzed string `json:"subject_analyzed"` Project string `json:"project"` MboxAuthorDomain string `json:"mbox_author_domain"` Date time.Time `json:"date"` IsPipermailMessage int `json:"is_pipermail_message"` List string `json:"list"` AuthorUUID string `json:"author_uuid"` AuthorMultiOrgNames []string `json:"author_multi_org_names"` Origin string `json:"origin"` Size int64 `json:"size"` Tag string `json:"tag"` Subject string `json:"subject"` FromID string `json:"from_id"` EmailDate time.Time `json:"email_date"` MetadataTimestamp time.Time `json:"metadata__timestamp"` MetadataBackendName string `json:"metadata__backend_name"` MetadataUpdatedOn time.Time `json:"metadata__updated_on"` MetadataEnrichedOn time.Time `json:"metadata__enriched_on"` ProjectSlug string `json:"project_slug"` ChangedAt time.Time `json:"changed_at"` GroupName string `json:"group_name"` Slug string `json:"slug"` References string `json:"references"` }
EnrichedMessage represents piper mail enriched message
type Enricher ¶
type Enricher struct { DSName string // Datasource will be used as key for ES ElasticSearchProvider ESClientProvider BackendVersion string // contains filtered or unexported fields }
Enricher contains pipermail datasource enrich logic
func NewEnricher ¶
func NewEnricher(backendVersion string, esClientProvider ESClientProvider, affiliationsClientProvider *affiliation.Affiliation) *Enricher
NewEnricher initiates a new Enricher
func (*Enricher) EnrichMessage ¶
func (e *Enricher) EnrichMessage(rawMessage *RawMessage, now time.Time) (*EnrichedMessage, error)
EnrichMessage enriches raw message
func (*Enricher) FormatTimestampString ¶
FormatTimestampString returns a formatted RFC 3339 Datetime string
func (*Enricher) GetEmailDomain ¶
GetEmailDomain ...
func (*Enricher) GetEmailUsername ¶
GetEmailUsername ...
func (*Enricher) GetUserName ¶
GetUserName ...
func (*Enricher) HandleMapping ¶
HandleMapping creates rich mapping
func (*Enricher) HandleObfuscatedEmail ¶
HandleObfuscatedEmail ...
func (*Enricher) IsValidEmail ¶
IsValidEmail validates email string
func (*Enricher) RemoveSpecialCharactersFromString ¶
RemoveSpecialCharactersFromString ...
type Fetcher ¶
type Fetcher struct { DSName string IncludeArchived bool HTTPClientProvider *http.ClientProvider ElasticSearchProvider *elastic.ClientProvider BackendVersion string Debug int DateFrom time.Time }
Fetcher contains piper mail datasource fetch logic
func NewFetcher ¶
func NewFetcher(params *Params, httpClientProvider *http.ClientProvider, esClientProvider *elastic.ClientProvider) *Fetcher
NewFetcher initiates a new pipermail fetcher
func (*Fetcher) AddMetadata ¶
func (f *Fetcher) AddMetadata(msg interface{}, endpoint, slug, groupName string) *RawMessage
AddMetadata - add metadata to the raw message
func (*Fetcher) ElasticRawMapping ¶
ElasticRawMapping - Raw index mapping definition
func (*Fetcher) ElasticRichMapping ¶
ElasticRichMapping - Rich index mapping definition
func (*Fetcher) Fetch ¶
Fetch the mbox files from the remote archiver.
Stores the archives in the path given during the initialization of this object. Archives that do not have valid extensions will be ignored.
Pipermail archive file names contain the date of the archive, following the schema year-month. When fromDate is provided, only the mboxes whose year and month are equal to or after that date are returned.
fromDate: fetch archives that store messages dated on or after the given date; only year and month values are compared
returns a map of links and their paths of the fetched archives
func (*Fetcher) FetchItem ¶
func (f *Fetcher) FetchItem(slug, groupName, endpoint string, fromDate time.Time, limit int, now time.Time) ([]*RawMessage, error)
FetchItem extracts data from archives
func (*Fetcher) Find ¶
Find takes a slice and looks for an element in it. If the element is found it returns true, otherwise it returns false.
func (*Fetcher) GetLastDate ¶
GetLastDate gets fetching lastDate
func (*Fetcher) HandleMapping ¶
HandleMapping updates piper mail raw mapping
func (*Fetcher) ItemCategory ¶
ItemCategory - return the category of an item
func (*Fetcher) ItemUpdatedOn ¶
ItemUpdatedOn - return updated on date for an item
func (*Fetcher) ParseArchiveLinks ¶
ParseArchiveLinks scrapes the contents of a given url to extract compressed files download links
type HTTPClientProvider ¶
type HTTPClientProvider interface {
Request(url string, method string, header map[string]string, body []byte, params map[string]string) (statusCode int, resBody []byte, err error)
}
HTTPClientProvider is the interface used to send requests to a remote HTTP server
type Manager ¶
type Manager struct { Endpoint string Slug string GroupName string SHConnString string FetcherBackendVersion string EnricherBackendVersion string Fetch bool Enrich bool ESUrl string ESUsername string ESPassword string ESIndex string FromDate *time.Time HTTPTimeout time.Duration Project string FetchSize int EnrichSize int AffBaseURL string ESCacheURL string ESCacheUsername string ESCachePassword string AuthGrantType string AuthClientID string AuthClientSecret string AuthAudience string Auth0URL string Environment string WebHookURL string MaxWorkers int NumberOfRawMessages int // contains filtered or unexported fields }
Manager describes piper mail manager
func NewManager ¶
func NewManager(endPoint, slug, shConnStr, fetcherBackendVersion, enricherBackendVersion string, fetch bool, enrich bool, eSUrl string, esUser string, esPassword string, esIndex string, fromDate *time.Time, project string, fetchSize int, enrichSize int, affBaseURL, esCacheURL, esCacheUsername, esCachePassword, authGrantType, authClientID, authClientSecret, authAudience, auth0URL, env, webHookURL string) (*Manager, error)
NewManager initiates piper mail manager instance
type MessageSearchFields ¶
MessageSearchFields ...
type NestedHits ¶
NestedHits is the actual hit data
type NestedRawHits ¶
type NestedRawHits struct { ID string `json:"_id"` Source RawMessage `json:"_source"` }
NestedRawHits is the actual hit data
type Params ¶
type Params struct { FromDate time.Time BackendVersion string Project string Debug int ProjectSlug string GroupName string }
Params required parameters for piper mail fetcher
type RawMessage ¶
type RawMessage struct { BackendVersion string `json:"backend_version"` Data *RawMessageData `json:"data"` Tag string `json:"tag"` UUID string `json:"uuid"` SearchFields *MessageSearchFields `json:"search_fields"` Origin string `json:"origin"` UpdatedOn float64 `json:"updated_on"` MetadataUpdatedOn time.Time `json:"metadata__updated_on"` BackendName string `json:"backend_name"` MetadataTimestamp time.Time `json:"metadata__timestamp"` Timestamp float64 `json:"timestamp"` Category string `json:"category"` ProjectSlug string `json:"project_slug"` GroupName string `json:"group_name"` Project string `json:"project"` ChangedAt time.Time `json:"changed_at"` }
RawMessage represents piper mail raw message
type RawMessageData ¶
type RawMessageData struct { ContentType string `json:"Content-Type"` Date string `json:"Date"` From string `json:"From"` InReplyTo string `json:"In-Reply-To"` MboxByteLength int64 `json:"MBox-Bytes-Length"` MboxNBodies int `json:"MBox-N-Bodies"` MboxNLines int64 `json:"MBox-N-Lines"` MboxProjectName string `json:"MBox-Project-Name"` MboxValid bool `json:"MBox-Valid"` MboxWarn bool `json:"MBox-Warn"` MessageID string `json:"Message-ID"` References string `json:"References"` Subject string `json:"Subject"` Data struct { Text struct { Plain []struct { Data string `json:"data"` } `json:"plain"` } `json:"text"` } `json:"data"` DateInTZ string `json:"date_in_tz"` DateTZ float64 `json:"date_tz"` }
RawMessageData ...