Documentation ¶
Index ¶
- func GetValue(kv *badger.DB, key string) int
- func GetValues(kv *badger.DB, keys []string) (map[string][]byte, error)
- func InitKeyVal(dir string) *badger.DB
- func NewDB(cnf config.Config) *sql.DB
- func NewDbGorm(cnf config.Config) *gorm.DB
- func QuoteString(s string) string
- func ResetKeyVal(dir string) error
- func RunQuery(d *sql.DB, q string) *sql.Rows
- func Truncate(d *sql.DB, tables []string) error
- type ColBhlRefs
- type ColNomenRef
- type Item
- type ItemStats
- type NameOccurrence
- type NameString
- type Page
- type Part
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func InitKeyVal ¶
func InitKeyVal(dir string) *badger.DB
InitKeyVal finds and initializes a connection to a badger key-value store. If the store does not exist, InitKeyVal creates it.
func QuoteString ¶
QuoteString makes a string value compatible with SQL syntax by wrapping it in quotes and escaping internal quotes.
func ResetKeyVal ¶
Types ¶
type ColBhlRefs ¶ added in v0.1.1
// ColBhlRefs associates a Catalogue of Life record with a BHL item, part and
// page, and stores the odds and quality of that association.
type ColBhlRefs struct {
	// RecordID is the Catalogue of Life identifier of a name-string.
	RecordID string `gorm:"type:varchar(100);index:record_id_bhl"`
	// ItemID is an automatically generated identifier from the BHL database.
	// It corresponds to the ID field in Item.
	ItemID uint
	// PartID is an automatically generated identifier from the BHL database.
	PartID uint
	// PageID is the identifier autogenerated by the BHL database.
	PageID uint
	// Odds are calculated by a Naive Bayes algorithm. Only odds of 0.01 and
	// higher are considered. This field holds the odds of the best result.
	Odds float64
	// Quality is the probability that a reference is 'real':
	//
	//	0 - nothing is found
	//	1 - 15% (Odds > 0.01)
	//	2 - 50% (Odds > 0.1)
	//	3 - 80% (Odds > 1)
	//	4 - 98% (Odds > 10)
	Quality int
}
type ColNomenRef ¶ added in v0.1.1
// ColNomenRef holds a nomenclatural reference from the Catalogue of Life
// together with the classification and canonical forms of its name, and the
// best BHL item/part/page found for it.
type ColNomenRef struct {
	// ID is automatically generated.
	ID uint `gorm:"primary_key"`
	// RecordID is the Catalogue of Life identifier of a name-string.
	RecordID string `gorm:"type:varchar(100);primary_key;auto_increment:false"`
	// Name is the verbatim name-string from the CoL.
	Name string `sql:"type:CHARACTER VARYING(255) COLLATE \"C\" NOT NULL"`
	// Ref is a nomenclatural reference from the Catalogue of Life.
	Ref string
	// Kingdom is the kingdom name of the record.
	Kingdom string `gorm:"type:varchar(100)"`
	// Phylum is the phylum name of the record.
	Phylum string `gorm:"type:varchar(100)"`
	// Class is the class name of the record.
	Class string `gorm:"type:varchar(100)"`
	// Ordr is the order name of the record.
	Ordr string `gorm:"type:varchar(100)"`
	// Family is the family name of the record.
	Family string `gorm:"type:varchar(100);index"`
	// Genus is the genus name of the record.
	Genus string `gorm:"type:varchar(100);index"`
	// CanonicalSimple is a canonical form without hybrid signs, ranks etc.
	CanonicalSimple string `gorm:"type:varchar(255);index:canonical_simple" sql:"type:CHARACTER VARYING(255) COLLATE \"C\" NOT NULL"`
	// CanonicalStem is a canonical form after removal of suffixes and
	// substitution of some characters.
	CanonicalStem string `gorm:"type:varchar(255);index:canonical_stem" sql:"type:CHARACTER VARYING(255) COLLATE \"C\" NOT NULL"`
	// ItemID is an automatically generated identifier from the BHL database.
	// It corresponds to the ID field in Item.
	ItemID uint
	// PartID is an automatically generated identifier from the BHL database.
	PartID uint
	// PageID is the identifier autogenerated by the BHL database.
	PageID uint
	// RefsNum is the number of found links to potential nomenclatural
	// references.
	RefsNum uint
	// Odds are calculated by a Naive Bayes algorithm. Only odds of 0.01 and
	// higher are considered. This field holds the odds of the best result.
	Odds float64
	// Quality is the probability that a reference is 'real':
	//
	//	0 - nothing is found
	//	1 - 15% (Odds > 0.01)
	//	2 - 50% (Odds > 0.1)
	//	3 - 80% (Odds > 1)
	//	4 - 98% (Odds > 10)
	Quality int `gorm:"index"`
}
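ColNomenRef contains a nomenclatural reference from the Catalogue of Life together with the best matching BHL item, part and page found for it.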
type Item ¶
// Item is a physical entity digitized and aggregated by Internet Archive and
// BHL. It can be a volume of a journal, a book etc.
type Item struct {
	// ID is the identifier autogenerated by the BHL database.
	ID uint `gorm:"primary_key;auto_increment:false"`
	// BarCode is the identifier generated by Internet Archive for the Item.
	BarCode string `gorm:"type:varchar(100);unique_index;not null"`
	// Vol contains the non-normalized volume field from the BHL database.
	Vol string `gorm:"type:varchar(100)"`
	// YearStart contains the earliest year of publication. For a journal
	// volume it would be the publication year of the first journal issue, for
	// a book it would be the date of publication.
	YearStart sql.NullInt32
	// YearEnd contains the latest year of publication. The field is often
	// empty, if the Item was published at once.
	YearEnd sql.NullInt32
	// TitleID contains the automatically generated id of the parent title of
	// the item.
	TitleID uint `gorm:"not null"`
	// TitleDOI is the DOI of an item.
	TitleDOI string `gorm:"type:varchar(100)"`
	// TitleName is the name of a journal or a book.
	TitleName string `gorm:"type:varchar(255)"`
	// TitleAbbr1 is an acronym of a title where the first letter of each
	// word is used.
	TitleAbbr1 string `gorm:"type:varchar(10)"`
	// TitleAbbr2 is an acronym of a title where 'common' words like 'and',
	// 'the' etc. are omitted.
	TitleAbbr2 string `gorm:"type:varchar(10)"`
	// TitleYearStart is the first year when a title was published.
	TitleYearStart sql.NullInt32
	// TitleYearEnd is the last year when a title was published.
	TitleYearEnd sql.NullInt32
	// TitleLang is the most prevalent language of a title.
	TitleLang string `gorm:"type:varchar(20)"`
}
Item is a physical entity digitized and aggregated by Internet Archive and BHL. It can be a volume of a journal, a book etc.
type ItemStats ¶ added in v0.1.0
// ItemStats contains taxonomical statistics for items.
type ItemStats struct {
	// ID is the Item identifier autogenerated by the BHL database.
	ID uint `gorm:"primary_key;auto_increment:false"`
	// NamesTotal is the number of unique names in the item (rank genus and
	// lower) verified by the Catalogue of Life and used in statistics
	// calculations.
	NamesTotal uint `gorm:"not null"`
	// MainTaxon is the taxon containing more than 50% of all taxa in the item.
	MainTaxon string `gorm:"type:varchar(100)"`
	// MainTaxonRank is the rank of the MainTaxon.
	MainTaxonRank string `gorm:"type:varchar(100)"`
	// MainTaxonPercent is the percentage of taxa belonging to the MainTaxon.
	// It is a numeric field, so it carries no varchar column type; this keeps
	// it consistent with the other *Percent fields of this struct.
	MainTaxonPercent uint
	// MainKingdom is the kingdom that contains most of the taxa in the item.
	MainKingdom string `gorm:"type:varchar(100)"`
	// MainKingdomPercent is the percentage of taxa associated with the
	// MainKingdom.
	MainKingdomPercent uint
	// AnimaliaNum is the number of unique names in the item associated with
	// Animalia by the Catalogue of Life.
	AnimaliaNum uint `gorm:"not null"`
	// PlantaeNum is the number of unique names in the item associated with
	// Plantae by the Catalogue of Life.
	PlantaeNum uint `gorm:"not null"`
	// FungiNum is the number of unique names in the item associated with Fungi
	// by the Catalogue of Life.
	FungiNum uint `gorm:"not null"`
	// BacteriaNum is the number of unique names in the item associated with
	// Bacteria by the Catalogue of Life.
	BacteriaNum uint `gorm:"not null"`
	// MainPhylum is the phylum that contains most of the taxa in the item.
	MainPhylum string `gorm:"type:varchar(100)"`
	// MainPhylumPercent is the percentage of taxa associated with the
	// MainPhylum.
	MainPhylumPercent uint
	// MainClass is the class that contains most of the taxa in the item.
	MainClass string `gorm:"type:varchar(100)"`
	// MainClassPercent is the percentage of taxa associated with the
	// MainClass.
	MainClassPercent uint
	// MainOrder is the order that contains most of the taxa in the item.
	MainOrder string `gorm:"type:varchar(100)"`
	// MainOrderPercent is the percentage of taxa associated with the
	// MainOrder.
	MainOrderPercent uint
	// MainFamily is the family that contains most of the taxa in the item.
	MainFamily string `gorm:"type:varchar(100)"`
	// MainFamilyPercent is the percentage of taxa associated with the
	// MainFamily.
	MainFamilyPercent uint
	// MainGenus is the genus that contains most of the taxa in the item.
	MainGenus string `gorm:"type:varchar(100)"`
	// MainGenusPercent is the percentage of taxa associated with the
	// MainGenus.
	MainGenusPercent uint
}
ItemStats contains taxonomical statistics for items.
type NameOccurrence ¶ added in v0.1.0
// NameOccurrence is the occurrence of a name-string in BHL.
type NameOccurrence struct {
	// PageID corresponds to the ID field in Page. It is a number
	// automatically generated by the BHL database.
	PageID uint
	// NameStringID corresponds to the ID field in NameString.
	// It is a UUID v5 generated from the normalized version of
	// a detected name.
	NameStringID string `sql:"type:uuid;index:name_string"`
	// OffsetStart is the starting position of a detected name on the page.
	// It is calculated using UTF-8 characters.
	OffsetStart uint
	// OffsetEnd is the ending position of a detected name on the page.
	// It is calculated using UTF-8 characters.
	OffsetEnd uint
	// OddsLog10 is a logarithm with base 10 of odds that a detected string is
	// actually a scientific name according to a Naive Bayes algorithm.
	OddsLog10 float64
	// AnnotNomen is a normalized nomenclatural annotation detected in the
	// vicinity of the occurrence. Examples of annotations are `NO_ANNOT`,
	// `SP_NOV` etc.
	AnnotNomen string `gorm:"type:varchar(50);index:annot"`
}
NameOccurrence is the occurrence of a name-string in BHL.
type NameString ¶
// NameString is a unique normalized name-string that has been matched, at
// least partially, to the Catalogue of Life.
type NameString struct {
	// ID is a UUID v5 generated from the Name field. There is always a
	// 1:1 relationship between Name and ID.
	ID string `gorm:"type:uuid;primary_key"`
	// Name is the normalized version of a name detected in BHL.
	Name string `sql:"type:CHARACTER VARYING(255) COLLATE \"C\" NOT NULL"`
	// RecordID is the Catalogue of Life identifier of a matched taxon.
	RecordID string `gorm:"type:varchar(100);index:record_id"`
	// MatchType describes the resulting kind of a name-string match.
	// The following match types are possible:
	//
	//	NoMatch      - GNverifier did not find a match for the name-string.
	//	Exact        - Canonical form of a name matched exactly.
	//	PartialExact - Canonical form matched exactly after removal of some words.
	//	Fuzzy        - Canonical form matched, but with some differences.
	//	PartialFuzzy - Canonical form matched with differences after removal of some words.
	//	Virus        - Name-string matched as a virus name.
	MatchType string `gorm:"type:varchar(100)"`
	// EditDistance shows how much difference exists between a name-string and
	// a match according to the Levenshtein algorithm.
	EditDistance uint
	// StemEditDistance shows how much difference exists between a name-string
	// and a match according to the Levenshtein algorithm.
	// NOTE(review): presumably computed on stemmed forms, unlike
	// EditDistance — confirm against the code that fills this field.
	StemEditDistance uint
	// MatchedName provides the complete matched name-string.
	MatchedName string `gorm:"type:varchar(255)"`
	// MatchedCanonical provides the canonical form of the matched name-string.
	MatchedCanonical string `gorm:"type:varchar(255);index:canonical" sql:"type:CHARACTER VARYING(255) COLLATE \"C\" NOT NULL"`
	// CurrentName is the full currently accepted name of the match
	// provided by the DataSource.
	CurrentName string `gorm:"type:varchar(255)"`
	// CurrentCanonical is a canonical form of the currently accepted name of
	// the match.
	CurrentCanonical string `gorm:"type:varchar(255);index:current_canonical" sql:"type:CHARACTER VARYING(255) COLLATE \"C\" NOT NULL"`
	// Classification contains a classification of the name provided by the
	// Catalogue of Life.
	Classification string
	// ClassificationRanks provides ranks information for the classification
	// path.
	ClassificationRanks string
	// ClassificationIDs provides RankIDs for the classification path.
	ClassificationIDs string
	// DataSourceId is the ID of the data-source according to GNverifier.
	// The mapping of IDs to data-sources can be found at
	// https://verifier.globalnames.org/data_sources
	// site. In this case it should always be 1.
	DataSourceId sql.NullInt32
	// DataSourceTitle provides the title of the data-source that matched the
	// name-string. In this case it should always be `The Catalogue of Life`.
	DataSourceTitle string `gorm:"type:varchar(255)"`
	// DataSourcesNumber is the number of data-sources that matched the name.
	DataSourcesNumber uint
	// Curation provides information about the level of curation according to
	// GNverifier. The following categories are supported:
	//
	//	NotCurated  -- None of the data-sources that matched a name-string are marked as curated.
	//	Curated     -- Some data-sources with a match are marked as curated.
	//	AutoCurated -- Some data-sources have automatic quality control, but not much human curation.
	//
	// NOTE(review): the field is a bool, so it cannot hold all three
	// categories listed above; how they map to true/false is not visible
	// here — confirm against the code that fills this field.
	Curation bool `gorm:"index:curation"`
	// Occurences is the number of times this name appeared in BHL texts.
	// The field name keeps its existing spelling.
	Occurences uint
	// OddsLog10 is a logarithm with base 10 of odds that a detected string is
	// actually a scientific name according to a Naive Bayes algorithm.
	OddsLog10 float32
	// Error contains the error that happened during verification. If this
	// field is empty then verification was completed successfully for the
	// name-string.
	Error string `gorm:"type:varchar(255)"`
}
NameString is a unique normalized name-string that has been matched, at least partially, to the Catalogue of Life.
type Page ¶
// Page contains metadata about a page file from the BHL archive.
type Page struct {
	// ID is the identifier autogenerated by the BHL database.
	ID uint `gorm:"primary_key;auto_increment:false"`
	// ItemID is an automatically generated identifier from the BHL database.
	// It corresponds to the ID field in Item.
	ItemID uint `gorm:"index:item;not null"`
	// SequenceOrder corresponds to the ordered position of a page in an item.
	// For example, an item page that is preceded by 4 other pages should
	// have SequenceOrder 5.
	SequenceOrder uint `gorm:"not null"`
	// PageNum corresponds to the page number/label assigned by the publisher
	// of the item.
	PageNum sql.NullInt64
}
Page contains metadata about a page file from BHL archive.
type Part ¶
// Part is a distinct part of an item. It can be a chapter, an article, a
// scientific paper.
type Part struct {
	// ID is an automatically generated identifier from the BHL database.
	ID uint `gorm:"primary_key;auto_increment:false"`
	// PageID is an automatically generated identifier for a page. It comes
	// from the BHL database.
	PageID sql.NullInt32
	// ItemID is an automatically generated identifier for an item. It comes
	// from the BHL database.
	ItemID sql.NullInt32
	// Length is the length of a part in pages.
	Length sql.NullInt32
	// DOI is a DOI assigned to the part.
	DOI string `gorm:"type:varchar(100)"`
	// ContributorName is the name of a project/person which provided
	// information about a part.
	ContributorName string `gorm:"type:varchar(255)"`
	// SequenceOrder is the sequential position of a part in the item. For
	// example, the second scientific paper in a journal will have
	// the SequenceOrder 2.
	SequenceOrder sql.NullInt32
	// SegmentType describes the type of a part. For example chapter, article,
	// etc.
	SegmentType string `gorm:"type:varchar(100)"`
	// Title is the title of the part.
	Title string `gorm:"type:text"`
	// ContainerTitle is a title of a parent unit (items title?).
	ContainerTitle string `gorm:"type:text"`
	// PublicationDetails describes information about the publisher.
	PublicationDetails string `gorm:"type:text"`
	// Volume is the volume of a citation.
	Volume string `gorm:"type:varchar(100)"`
	// Series is the series of a citation.
	Series string `gorm:"type:varchar(100)"`
	// Issue is an issue of a citation.
	Issue string `gorm:"type:varchar(100)"`
	// Date is the date of the part publication.
	Date string `gorm:"type:varchar(100)"`
	// Year is the year of a part.
	Year sql.NullInt32 `gorm:"index:year"`
	// YearEnd is the year when a part finished its publication.
	YearEnd sql.NullInt32
	// Month is the month when a part was published.
	Month sql.NullInt32
	// Day is the day when a part was published.
	Day sql.NullInt32
	// PageNumStart is the page number where a part starts.
	PageNumStart sql.NullInt32
	// PageNumEnd is the page number where a part ends.
	PageNumEnd sql.NullInt32
	// Language is the prevalent language of a part.
	Language string `gorm:"type:varchar(20)"`
}
Part is a distinct part of an item. It can be a chapter, an article, a scientific paper.