Documentation ¶
Index ¶
- Constants
- Variables
- func ConvertArticleInCategories(logger zerolog.Logger, namespace uuid.UUID, mnemonicPrefix, id string, ...) errors.E
- func ConvertArticleRedirects(logger zerolog.Logger, namespace uuid.UUID, id string, ...) errors.E
- func ConvertArticleUsedTemplates(logger zerolog.Logger, namespace uuid.UUID, mnemonicPrefix, id string, ...) errors.E
- func ConvertCategoryDescription(id, from, html string, doc *peerdb.Document) errors.E
- func ConvertEntity(ctx context.Context, index string, logger zerolog.Logger, ...) (*peerdb.Document, errors.E)
- func ConvertFileDescription(namespace uuid.UUID, id, from, html string, doc *peerdb.Document) errors.E
- func ConvertPageInCategories(logger zerolog.Logger, namespace uuid.UUID, mnemonicPrefix, id string, ...) errors.E
- func ConvertPageRedirects(logger zerolog.Logger, namespace uuid.UUID, id string, page AllPagesPage, ...) errors.E
- func ConvertPageUsedTemplates(logger zerolog.Logger, namespace uuid.UUID, mnemonicPrefix, id string, ...) errors.E
- func ConvertTemplateDescription(id, from string, html string, doc *peerdb.Document) errors.E
- func ConvertWikimediaCommonsImage(ctx context.Context, logger zerolog.Logger, httpClient *retryablehttp.Client, ...) (*peerdb.Document, errors.E)
- func ConvertWikipediaArticle(id, html string, doc *peerdb.Document) errors.E
- func ConvertWikipediaImage(ctx context.Context, logger zerolog.Logger, httpClient *retryablehttp.Client, ...) (*peerdb.Document, errors.E)
- func ExtractArticle(input string) (string, *goquery.Document, errors.E)
- func ExtractArticleSummary(doc *goquery.Document) (string, errors.E)
- func ExtractCategoryDescription(input string) (string, errors.E)
- func ExtractFileDescriptions(input string) ([]string, errors.E)
- func ExtractTemplateDescription(input string) (string, errors.E)
- func FirstUpperCase(str string) string
- func GetMediawikiFilePrefix(filename string) string
- func GetPageHTML(ctx context.Context, httpClient *retryablehttp.Client, site, title string) (string, errors.E)
- func GetWikidataDocumentID(id string) identifier.Identifier
- func GetWikidataItem(ctx context.Context, index string, esClient *elastic.Client, id string) (*peerdb.Document, *elastic.SearchHit, errors.E)
- func GetWikimediaCommonsFile(ctx context.Context, index string, esClient *elastic.Client, name string) (*peerdb.Document, *elastic.SearchHit, errors.E)
- func GetWikipediaFile(ctx context.Context, index string, esClient *elastic.Client, name string) (*peerdb.Document, *elastic.SearchHit, errors.E)
- func ListAllPages(ctx context.Context, httpClient *retryablehttp.Client, namespaces []int, ...) errors.E
- func SetPageID(namespace uuid.UUID, mnemonicPrefix string, id string, pageID int64, ...) errors.E
- func UpdateEmbeddedDocuments(ctx context.Context, index string, logger zerolog.Logger, ...) (bool, errors.E)
- type AllPagesPage
- type Image
- type ImageInfo
- type PageReference
Constants ¶
const ( WikidataReference = "Wikidata" WikimediaCommonsEntityReference = "CommonsEntity" WikimediaCommonsFileReference = "CommonsFile" WikipediaCategoryReference = "WikipediaCategory" WikipediaTemplateReference = "WikipediaTemplate" WikimediaCommonsCategoryReference = "CommonsCategory" WikimediaCommonsTemplateReference = "CommonsTemplate" )
const (
// All pages API has this limit and it does not depend on the token used.
APILimit = 500
)
Variables ¶
var ( ErrSkipped = errors.Base("skipped") ErrSilentSkipped = errors.BaseWrap(ErrSkipped, "silent skipped") )
var ( NameSpaceWikidata = uuid.MustParse("8f8ba777-bcce-4e45-8dd4-a328e6722c82") ErrNotFound = errors.Base("not found") )
var ( //nolint:gochecknoglobals NameSpaceWikipediaFile = uuid.MustParse("94b1c372-bc28-454c-a45a-2e4d29d15146") ErrWikimediaCommonsFile = errors.Base("file is from Wikimedia Commons error") )
var (
NameSpaceWikimediaCommonsFile = uuid.MustParse("31974ea8-ab0c-466d-9aaa-e1bf3c959edc")
)
Functions ¶
func ConvertArticleInCategories ¶
func ConvertArticleInCategories(logger zerolog.Logger, namespace uuid.UUID, mnemonicPrefix, id string, article mediawiki.Article, doc *peerdb.Document) errors.E
TODO: How to remove categories which have previously been added but are later removed?
func ConvertArticleRedirects ¶
func ConvertArticleRedirects(logger zerolog.Logger, namespace uuid.UUID, id string, article mediawiki.Article, doc *peerdb.Document) errors.E
TODO: How to remove redirects which have previously been added but are later removed?
func ConvertArticleUsedTemplates ¶
func ConvertArticleUsedTemplates(logger zerolog.Logger, namespace uuid.UUID, mnemonicPrefix, id string, article mediawiki.Article, doc *peerdb.Document) errors.E
TODO: How to remove templates which have previously been added but are later removed?
func ConvertEntity ¶
func ConvertEntity( ctx context.Context, index string, logger zerolog.Logger, esClient *elastic.Client, cache *es.Cache, namespace uuid.UUID, entity mediawiki.Entity, ) (*peerdb.Document, errors.E)
ConvertEntity converts both Wikidata entities and Wikimedia Commons entities. Entities can reference only Wikimedia Commons files and not Wikipedia files.
func ConvertFileDescription ¶
func ConvertPageInCategories ¶
func ConvertPageInCategories(logger zerolog.Logger, namespace uuid.UUID, mnemonicPrefix, id string, page AllPagesPage, doc *peerdb.Document) errors.E
TODO: How to remove categories which have previously been added but are later removed?
func ConvertPageRedirects ¶
func ConvertPageRedirects(logger zerolog.Logger, namespace uuid.UUID, id string, page AllPagesPage, doc *peerdb.Document) errors.E
TODO: How to remove redirects which have previously been added but are later removed?
func ConvertPageUsedTemplates ¶
func ConvertPageUsedTemplates(logger zerolog.Logger, namespace uuid.UUID, mnemonicPrefix, id string, page AllPagesPage, doc *peerdb.Document) errors.E
TODO: How to remove templates which have previously been added but are later removed?
func ConvertWikipediaArticle ¶
TODO: Store the revision, license, and source used for the HTML into a meta claim.
TODO: Investigate how to make use of additional entities metadata. See: https://www.mediawiki.org/wiki/Topic:Wotwu75akwx2wnsb
TODO: Make internal links to other articles work in HTML (link to PeerDB documents instead).
TODO: Remove links to other articles which do not exist, if there are any.
TODO: Clean custom tags and attributes used in HTML to add metadata into HTML, potentially extract and store that. See: https://www.mediawiki.org/wiki/Specs/HTML/2.4.0
TODO: Remove some templates (e.g., infobox, top-level notices) and convert them to claims.
TODO: Extract all links pointing out of the article into claims and reverse claims (so if they point to other documents, they should have a backlink as a claim).
func ConvertWikipediaImage ¶
func ExtractArticleSummary ¶
ExtractArticleSummary should be called on the output of ExtractArticle.
func FirstUpperCase ¶
The implementation changes the case of ASCII characters only. Using unicode.ToUpper sometimes changes the case of characters for which MediaWiki does not change it. If we do change case when MediaWiki does not, a corresponding file is not found. On the other hand, if we do not change case when MediaWiki does, then the API returns a "normalized" field which fails JSON decoding, so we detect such cases if and when they happen. See: https://phabricator.wikimedia.org/T301758
func GetMediawikiFilePrefix ¶
func GetPageHTML ¶
func GetWikidataDocumentID ¶
func GetWikidataDocumentID(id string) identifier.Identifier
func GetWikidataItem ¶
func GetWikimediaCommonsFile ¶
func GetWikipediaFile ¶
func ListAllPages ¶
Types ¶
type AllPagesPage ¶
type AllPagesPage struct { Identifier int64 `json:"pageid"` Namespace int `json:"ns"` Title string `json:"title"` Properties map[string]string `json:"pageprops"` Categories []PageReference `json:"categories,omitempty"` Templates []PageReference `json:"templates,omitempty"` Redirects []PageReference `json:"redirects,omitempty"` }
type Image ¶
type Image struct { Name string `json:"img_name"` Size int64 `json:"img_size"` Width int64 `json:"img_width"` Height int64 `json:"img_height"` Metadata map[string]interface{} `json:"-"` Bits int64 `json:"img_bits"` MediaType string `json:"img_media_type"` MajorMIME string `json:"img_major_mime"` MinorMIME string `json:"img_minor_mime"` DescriptionID int64 `json:"img_description_id"` ActorID int64 `json:"img_actor"` Timestamp time.Time `json:"-"` SHA1 string `json:"img_sha1"` }
func (*Image) UnmarshalJSON ¶
type ImageInfo ¶
type ImageInfo struct { Mime string `json:"mime"` Size int `json:"size"` Width int `json:"width"` Height int `json:"height"` PageCount int `json:"pagecount"` Duration float64 `json:"duration"` URL string `json:"url"` DescriptionURL string `json:"descriptionurl"` DescriptionShortURL string `json:"descriptionshorturl"` // Set if the requested page redirected to another page and info is from that other page. Redirect string `json:"-"` }