Documentation ¶
Overview ¶
Package seafan is a set of tools for building DNN models. The build engine is gorgonia (https://pkg.go.dev/gorgonia.org/gorgonia).
Seafan features:
- A data pipeline based on chutils (https://github.com/invertedv/chutils) to access files and ClickHouse tables.
- Point-and-shoot specification of the data
- Simple specification of one-hot features
- A wrapper around gorgonia that meshes to the pipeline.
- Simple specification of models, including embeddings
- A fit method with optional early stopping and callbacks
- Saving and loading models
- Model diagnostics for categorical targets.
- KS plots
- Decile plots
- Utilities.
- Plotting wrapper for plotly (https://github.com/MetalBlueberry/go-plotly) for xy plots.
- Numeric struct for (x,y) data and plotting and descriptive statistics.
Index ¶
- Variables
- func AddFitted(pipeIn Pipeline, nnFile string, target []int, name string, fts FTypes) error
- func AnyLess(x, y any) (bool, error)
- func CrossEntropy(model NNet) (cost *G.Node)
- func Decile(xy *XY, plt *PlotDef) error
- func GetNode(ns G.Nodes, name string) *G.Node
- func KS(xy *XY, plt *PlotDef) (ks float64, notTarget *Desc, target *Desc, err error)
- func LeakyReluAct(n *G.Node, alpha float64) *G.Node
- func LinearAct(n *G.Node) *G.Node
- func Marginal(nnFile string, feat string, target []int, pipe Pipeline, pd *PlotDef) error
- func Max(a, b int) int
- func Min(a, b int) int
- func Plotter(fig *grob.Fig, lay *grob.Layout, pd *PlotDef) error
- func RMS(model NNet) (cost *G.Node)
- func ReluAct(n *G.Node) *G.Node
- func SigmoidAct(n *G.Node) *G.Node
- func SoftMaxAct(n *G.Node) *G.Node
- func SoftRMS(model NNet) (cost *G.Node)
- func Strip(s string) (left, inner string, err error)
- func Unique(xs []any) []any
- func Wrapper(e error, text string) error
- type Activation
- type Args
- type ChData
- func (ch *ChData) Batch(inputs G.Nodes) bool
- func (ch *ChData) BatchSize() int
- func (ch *ChData) Cols(field string) int
- func (ch *ChData) Describe(field string, topK int) string
- func (ch *ChData) Epoch(setTo int) int
- func (ch *ChData) FieldList() []string
- func (ch *ChData) GData() *GData
- func (ch *ChData) Get(field string) *GDatum
- func (ch *ChData) GetFType(field string) *FType
- func (ch *ChData) GetFTypes() FTypes
- func (ch *ChData) Init() (err error)
- func (ch *ChData) IsCat(field string) bool
- func (ch *ChData) IsCts(field string) bool
- func (ch *ChData) IsNormalized(field string) bool
- func (ch *ChData) IsSorted() bool
- func (ch *ChData) Name() string
- func (ch *ChData) Rows() int
- func (ch *ChData) SaveFTypes(fileName string) error
- func (ch *ChData) Shuffle()
- func (ch *ChData) Slice(sl Slicer) (Pipeline, error)
- func (ch *ChData) Sort(field string, ascending bool) error
- func (ch *ChData) SortField() string
- func (ch *ChData) String() string
- type CostFunc
- type DOLayer
- type Desc
- type FCLayer
- type FParam
- type FRole
- type FType
- type FTypes
- type Fit
- type FitOpts
- type GData
- func (gd *GData) AppendC(raw *Raw, name string, normalize bool, fp *FParam) error
- func (gd *GData) AppendD(raw *Raw, name string, fp *FParam) error
- func (gd *GData) Drop(field string)
- func (gd *GData) FieldCount() int
- func (gd *GData) FieldList() []string
- func (gd *GData) Get(name string) *GDatum
- func (gd *GData) GetRaw(field string) (*Raw, error)
- func (gd *GData) IsSorted() bool
- func (gd *GData) Len() int
- func (gd *GData) Less(i, j int) bool
- func (gd *GData) MakeOneHot(from, name string) error
- func (gd *GData) Rows() int
- func (gd *GData) Shuffle()
- func (gd *GData) Slice(sl Slicer) (*GData, error)
- func (gd *GData) Sort(field string, ascending bool) error
- func (gd *GData) SortField() string
- func (gd *GData) Swap(i, j int)
- func (gd *GData) UpdateFts(newFts FTypes) (*GData, error)
- type GDatum
- type Layer
- type Levels
- type ModSpec
- func (m ModSpec) Check() error
- func (m ModSpec) DropOut(loc int) *DOLayer
- func (m ModSpec) FC(loc int) *FCLayer
- func (m ModSpec) Inputs(p Pipeline) (FTypes, error)
- func (m ModSpec) LType(i int) (*Layer, error)
- func (m ModSpec) Save(fileName string) (err error)
- func (m ModSpec) Target(p Pipeline) (*FType, error)
- type NNModel
- func LoadNN(fileRoot string, p Pipeline, build bool) (nn *NNModel, err error)
- func NewNNModel(modSpec ModSpec, pipe Pipeline, build bool, no ...NNOpts) (*NNModel, error)
- func PredictNN(fileRoot string, pipe Pipeline, build bool, opts ...NNOpts) (nn *NNModel, err error)
- func PredictNNwFts(fileRoot string, pipe Pipeline, build bool, fts FTypes, opts ...NNOpts) (nn *NNModel, err error)
- func (m *NNModel) Cost() *G.Node
- func (m *NNModel) CostFlt() float64
- func (m *NNModel) CostFn() CostFunc
- func (m *NNModel) Features() G.Nodes
- func (m *NNModel) FitSlice() []float64
- func (m *NNModel) Fitted() G.Result
- func (m *NNModel) Fwd()
- func (m *NNModel) G() *G.ExprGraph
- func (m *NNModel) InputFT() FTypes
- func (m *NNModel) Inputs() G.Nodes
- func (m *NNModel) Name() string
- func (m *NNModel) Obs() *G.Node
- func (m *NNModel) ObsSlice() []float64
- func (m *NNModel) OutputCols() int
- func (m *NNModel) Params() G.Nodes
- func (m *NNModel) Save(fileRoot string) (err error)
- func (m *NNModel) String() string
- type NNOpts
- type NNet
- type Opts
- type Pipeline
- type PlotDef
- type Raw
- type SeaError
- type Slice
- type Slicer
- type Summary
- type VecData
- func (vec *VecData) Batch(inputs G.Nodes) bool
- func (vec *VecData) BatchSize() int
- func (vec *VecData) Cols(field string) int
- func (vec *VecData) Describe(field string, topK int) string
- func (vec *VecData) Epoch(setTo int) int
- func (vec *VecData) FieldList() []string
- func (vec *VecData) GData() *GData
- func (vec *VecData) Get(field string) *GDatum
- func (vec *VecData) GetFType(field string) *FType
- func (vec *VecData) GetFTypes() FTypes
- func (vec *VecData) Init() error
- func (vec *VecData) IsCat(field string) bool
- func (vec *VecData) IsCts(field string) bool
- func (vec *VecData) IsNormalized(field string) bool
- func (vec *VecData) IsSorted() bool
- func (vec *VecData) Name() string
- func (vec *VecData) Rows() int
- func (vec *VecData) SaveFTypes(fileName string) error
- func (vec *VecData) Shuffle()
- func (vec *VecData) Slice(sl Slicer) (Pipeline, error)
- func (vec *VecData) Sort(field string, ascending bool) error
- func (vec *VecData) SortField() string
- func (vec *VecData) String() string
- type XY
Examples ¶
Constants ¶
This section is empty.
Variables ¶
var Browser = "firefox"
Browser is the browser to use for plotting.
var Verbose = true
Verbose controls amount of printing.
Functions ¶
func AddFitted ¶
AddFitted adds fitted values to a Pipeline. pipeIn -- input Pipeline to run the model on; nnFile -- root directory of NNModel; target -- target columns of the model output to coalesce; name -- name of fitted value in Pipeline; fts -- optional FTypes to use for normalizing pipeIn.
func Decile ¶
Decile generates a decile plot of a softmax model that is reduced to a binary outcome.
y observed multinomial values fit fitted softmax probabilities trg columns of y to be grouped into a single outcome. The complement is reduced to the alternate outcome. logodds if true, fit is in log odds space plt PlotDef plot options. If plt is nil an error is generated.
Target: html plot file and/or plot in browser.
func KS ¶
KS finds the KS of a softmax model that is reduced to a binary outcome.
y observed multinomial values fit fitted softmax probabilities trg columns of y to be grouped into a single outcome. The complement is reduced to the alternate outcome. logodds if true, fit is in log odds space plt PlotDef plot options. If plt is nil, no plot is produced.
The ks statistic is returned as are Desc descriptions of the model for the two groups. Returns
ks KS statistic notTarget Desc struct of fitted values of the non-target outcomes target Desc struct of fitted values of target outcomes
Target: html plot file and/or plot in browser.
func LeakyReluAct ¶
LeakyReluAct is leaky relu activation
func Marginal ¶
Marginal produces a set of plots to aid in understanding the effect of a feature. The plot takes the model output and creates four segments based on the quartiles of the model output. For each segment, the feature being analyzed varies across its range within the quartile (continuous) or across its values (discrete). The bottom row shows the distribution of the feature within the quartile range.
func Plotter ¶
Plotter plots the Plotly Figure fig with Layout lay. The layout is augmented by features I commonly use.
fig plotly figure lay plotly layout (nil is OK) pd PlotDef structure with plot options.
lay can be initialized with any additional layout options needed.
func SoftMaxAct ¶
SoftMaxAct implements the softmax activation function.
Types ¶
type Activation ¶
type Activation int
Activation types
const ( Linear Activation = 0 + iota Relu LeakyRelu Sigmoid SoftMax )
func StrAct ¶
func StrAct(s string) (*Activation, float64)
StrAct takes a string and returns corresponding Activation and any parameter. Nil if fails.
func (Activation) String ¶
func (i Activation) String() string
type Args ¶
Args map holds layer arguments in key/val style
type ChData ¶
type ChData struct {
// contains filtered or unexported fields
}
ChData provides a Pipeline interface into text files (delimited, fixed length) and ClickHouse.
func (*ChData) Batch ¶
Batch loads a batch into inputs. It returns false if the epoch is done. If cycle is true, it will start at the beginning on the next call. If cycle is false, it will call Init() at the next call to Batch()
Example ¶
dataPath := os.Getenv("data") // path to data directory fileName := dataPath + "/test1.csv" f, e := os.Open(fileName) if e != nil { panic(e) } // set up chutils file reader rdr := file.NewReader(fileName, ',', '\n', 0, 0, 1, 0, f, 0) e = rdr.Init("", chutils.MergeTree) if e != nil { panic(e) } // determine data types e = rdr.TableSpec().Impute(rdr, 0, .99) if e != nil { panic(e) } bSize := 100 ch := NewChData("Test ch Pipeline", WithBatchSize(bSize), WithReader(rdr), WithNormalized("x1")) // create a graph & node to illustrate Batch() g := G.NewGraph() node := G.NewTensor(g, G.Float64, 2, G.WithName("x1"), G.WithShape(bSize, 1), G.WithInit(G.Zeroes())) var sumX = 0.0 n := 0 // run through batchs and verify counts and mean of x1 is zero for ch.Batch(G.Nodes{node}) { n += bSize x := node.Value().Data().([]float64) for _, xv := range x { sumX += xv } } mean := sumX / float64(n) fmt.Printf("mean of x1: %0.2f", math.Abs(mean)) // Target: // rows read: 8500 // mean of x1: 0.00
Output:
Example (Example2) ¶
// We can normalize fields by values we supply rather than the values in the epoch. dataPath := os.Getenv("data") // path to data directory fileName := dataPath + "/test1.csv" f, e := os.Open(fileName) if e != nil { panic(e) } // set up chutils file reader rdr := file.NewReader(fileName, ',', '\n', 0, 0, 1, 0, f, 0) e = rdr.Init("", chutils.MergeTree) if e != nil { panic(e) } // determine data types e = rdr.TableSpec().Impute(rdr, 0, .99) if e != nil { panic(e) } bSize := 100 // Let's normalize x1 with location=41 and scale=1 ft := &FType{ Name: "x1", Role: 0, Cats: 0, EmbCols: 0, Normalized: true, From: "", FP: &FParam{Location: 40, Scale: 1}, } ch := NewChData("Test ch Pipeline", WithBatchSize(bSize), WithReader(rdr)) WithFtypes(FTypes{ft})(ch) // create a graph & node to illustrate Batch() g := G.NewGraph() node := G.NewTensor(g, G.Float64, 2, G.WithName("x1"), G.WithShape(bSize, 1), G.WithInit(G.Zeroes())) sumX := 0.0 n := 0 // run through batchs and verify counts and mean of x1 is zero for ch.Batch(G.Nodes{node}) { n += bSize x := node.Value().Data().([]float64) for _, xv := range x { sumX += xv } } mean := sumX / float64(n) fmt.Printf("mean of x1: %0.2f", math.Abs(mean)) // Target: // rows read: 8500 // mean of x1: 39.50
Output:
func (*ChData) Describe ¶
Describe describes a field. If the field has role FRCat, the top k values (by frequency) are returned.
func (*ChData) Init ¶
Init initializes the Pipeline.
Example ¶
dataPath := os.Getenv("data") // path to data directory fileName := dataPath + "/test1.csv" f, e := os.Open(fileName) if e != nil { panic(e) } // set up chutils file reader rdr := file.NewReader(fileName, ',', '\n', 0, 0, 1, 0, f, 0) e = rdr.Init("", chutils.MergeTree) if e != nil { panic(e) } // determine data types e = rdr.TableSpec().Impute(rdr, 0, .99) if e != nil { panic(e) } bSize := 100 ch := NewChData("Test ch Pipeline", WithBatchSize(bSize), WithReader(rdr), WithCycle(true), WithCats("y", "y1", "y2", "x4"), WithOneHot("yoh", "y"), WithOneHot("y1oh", "y1"), WithOneHot("x4oh", "x4"), WithNormalized("x1", "x2", "x3"), WithOneHot("y2oh", "y2")) // initialize pipeline e = ch.Init() if e != nil { panic(e) } // Target: // rows read: 8500
Output:
func (*ChData) IsNormalized ¶
IsNormalized returns true if the field is normalized.
func (*ChData) SaveFTypes ¶
SaveFTypes saves the FTypes for the Pipeline.
Example ¶
// Field Types (FTypes) can be saved once they're created. This preserves key information like // - The field role // - Location and Scale used in normalization // - Mapping of discrete fields // - Construction of one-hot fields dataPath := os.Getenv("data") // path to data directory fileName := dataPath + "/test1.csv" f, e := os.Open(fileName) if e != nil { panic(e) } // set up chutils file reader rdr := file.NewReader(fileName, ',', '\n', 0, 0, 1, 0, f, 0) e = rdr.Init("", chutils.MergeTree) if e != nil { panic(e) } // determine data types e = rdr.TableSpec().Impute(rdr, 0, .99) if e != nil { panic(e) } bSize := 100 ch := NewChData("Test ch Pipeline", WithBatchSize(bSize), WithReader(rdr), WithCycle(true), WithCats("y", "y1", "y2", "x4"), WithOneHot("yoh", "y"), WithOneHot("y1oh", "y1"), WithOneHot("x4oh", "x4"), WithNormalized("x1", "x2", "x3"), WithOneHot("y2oh", "y2")) // initialize pipeline e = ch.Init() if e != nil { panic(e) } outFile := os.TempDir() + "/seafan.json" if e = ch.SaveFTypes(outFile); e != nil { panic(e) } saveFTypes, e := LoadFTypes(outFile) if e != nil { panic(e) } ch1 := NewChData("Saved FTypes", WithReader(rdr), WithBatchSize(bSize), WithFtypes(saveFTypes)) if e := ch1.Init(); e != nil { panic(e) } fmt.Printf("Role of field y1oh: %s", ch.GetFType("y1oh").Role) // Target: // rows read: 8500 // rows read: 8500 // Role of field y1oh: FROneHot
Output:
type DOLayer ¶
type DOLayer struct { // position int // insert dropout after layer AfterLayer DropProb float64 // dropout probability }
DOLayer specifies a dropout layer. It occurs in the graph after dense layer AfterLayer (the input layer is layer 0).
func DropOutParse ¶
DropOutParse parses the arguments to a drop out layer
type Desc ¶
type Desc struct { Name string // Name is the name of feature we are describing N int // N is the number of observations U []float64 // U is the slice of locations at which to find the quantile Q []float64 // Q is the slice of empirical quantiles Mean float64 // Mean is the average of the data Std float64 // standard deviation }
Desc contains descriptive information of a float64 slice
func Assess ¶
func Assess(xy *XY, cutoff float64) (n int, precision, recall, accuracy float64, obs, fit *Desc, err error)
Assess returns a selection of statistics of the fit
func NewDesc ¶
NewDesc creates a pointer to a new Desc struct instance with error checking.
u is a slice of values at which to find quantiles. If nil, a standard set is used. name is the name of the feature (for printing).
type FCLayer ¶
type FCLayer struct { Size int Bias bool Act Activation ActParm float64 }
FCLayer has details of a fully connected layer
type FParam ¶
type FParam struct { Location float64 `json:"location"` // location parameter for *Cts Scale float64 `json:"scale"` // scale parameter for *Cts Default any `json:"default"` // default level for *Dscrt Lvl Levels `json:"lvl"` // map of values to int32 category for *Dscrt }
FParam -- field parameters -- is summary data about a field. These values may not be derived from the current data but are applied to the current data.
type FType ¶
type FType struct { Name string Role FRole Cats int EmbCols int Normalized bool From string FP *FParam }
FType represents a single field. It holds key information about the feature: its role, dimensions, summary info.
type FTypes ¶
type FTypes []*FType
func LoadFTypes ¶
LoadFTypes loads a file created by the FTypes Save method
func (FTypes) DropFields ¶
DropFields will drop fields from the FTypes
type Fit ¶
type Fit struct {
// contains filtered or unexported fields
}
Fit struct for fitting a NNModel
func (*Fit) BestEpoch ¶
BestEpoch returns the epoch of the best cost (validation or in-sample--whichever is specified)
func (*Fit) Do ¶
Do is the fitting loop.
Example ¶
Verbose = false bSize := 100 // generate a Pipeline of type *ChData that reads test.csv in the data directory pipe := chPipe(bSize, "test1.csv") // generate model: target and features. Target yoh is one-hot with 2 levels mod := ModSpec{ "Input(x1+x2+x3+x4)", "FC(size:3, activation:relu)", "DropOut(.1)", "FC(size:2, activation:softmax)", "Target(yoh)", } // model is straight-forward with no hidden layers or dropouts. nn, e := NewNNModel(mod, pipe, true, WithCostFn(CrossEntropy)) if e != nil { panic(e) } epochs := 150 ft := NewFit(nn, epochs, pipe) e = ft.Do() if e != nil { panic(e) } // Plot the in-sample cost in a browser (default: firefox) e = ft.InCosts().Plot(&PlotDef{Title: "In-Sample Cost Curve", Height: 1200, Width: 1200, Show: true, XTitle: "epoch", YTitle: "Cost"}, true) if e != nil { panic(e) } // Target:
Output:
Example (Example2) ¶
// This example demonstrates how to use a validation sample for early stopping Verbose = false bSize := 100 // generate a Pipeline of type *ChData that reads test.csv in the data directory mPipe := chPipe(bSize, "test1.csv") vPipe := chPipe(1000, "testVal.csv") // generate model: target and features. Target yoh is one-hot with 2 levels mod := ModSpec{ "Input(x1+x2+x3+x4)", "FC(size:3, activation:relu)", "DropOut(.1)", "FC(size:2, activation:softmax)", "Target(yoh)", } nn, e := NewNNModel(mod, mPipe, true, WithCostFn(CrossEntropy)) if e != nil { panic(e) } epochs := 150 ft := NewFit(nn, epochs, mPipe) WithValidation(vPipe, 10)(ft) e = ft.Do() if e != nil { panic(e) } // Plot the in-sample cost in a browser (default: firefox) e = ft.InCosts().Plot(&PlotDef{Title: "In-Sample Cost Curve", Height: 1200, Width: 1200, Show: true, XTitle: "epoch", YTitle: "Cost"}, true) if e != nil { panic(e) } e = ft.OutCosts().Plot(&PlotDef{Title: "Validation Sample Cost Curve", Height: 1200, Width: 1200, Show: true, XTitle: "epoch", YTitle: "Cost"}, true) if e != nil { panic(e) } // Target:
Output:
type FitOpts ¶
type FitOpts func(*Fit)
FitOpts functions add options
func WithLearnRate ¶
WithLearnRate sets a learning rate function that declines linearly across the epochs.
func WithOutFile ¶
WithOutFile specifies the file root name to save the best model.
func WithShuffle ¶
WithShuffle shuffles after interval epochs. Default is 0 (don't shuffle ever).
func WithValidation ¶
WithValidation adds a validation Pipeline for early stopping. The fit is stopped when the validation cost does not improve for wait epochs.
type GData ¶
type GData struct {
// contains filtered or unexported fields
}
func (*GData) FieldCount ¶
FieldCount returns the number of fields in GData
func (*GData) MakeOneHot ¶
MakeOneHot creates & appends a one hot feature from a discrete feature
func (*GData) Sort ¶
Sort sorts the GData on field. Calling sort.Sort directly will cause a panic. Sorting a OneHot or Embedded field sorts on the underlying Categorical field.
type GDatum ¶
type GDatum struct { FT *FType // FT stores the details of the field: it's role, # categories, mappings Summary Summary // Summary of the Data (e.g. distribution) Data any // Data. This will be either []float64 (FRCts, FROneHot, FREmbed) or []int32 (FRCat) }
type Levels ¶
Levels is a map from the underlying values of a discrete tensor to int32 values.
func ByPtr ¶
ByPtr returns a mapping of values of data to []int32 for modeling. The values of data are sorted, so the smallest will have a mapped value of 0.
type ModSpec ¶
type ModSpec []string
ModSpec holds layers--each slice element is a layer
func LoadModSpec ¶
LoadModSpec loads a ModSpec from file
func (ModSpec) DropOut ¶
DropOut returns the *DOLayer for layer loc, if it is of type DropOut. Returns nil otherwise.
type NNModel ¶
type NNModel struct {
// contains filtered or unexported fields
}
NNModel structure
func LoadNN ¶
LoadNN restores a previously saved NNModel. fileRoot is the root name of the save file. p is the Pipeline with the field specs. if build is true, DropOut layers are included.
func NewNNModel ¶
NewNNModel creates a new NN model. Specs for fields in modSpec are pulled from pipe. if build is true, DropOut layers are included.
func PredictNN ¶
PredictNN reads in a NNModel from a file and populates it with a batch from pipe. Methods such as FitSlice and ObsSlice are immediately available.
Example ¶
// This example demonstrates fitting a regression model and predicting on new data Verbose = false bSize := 100 // generate a Pipeline of type *ChData that reads test.csv in the data directory mPipe := chPipe(bSize, "test1.csv") vPipe := chPipe(1000, "testVal.csv") // This model is OLS mod := ModSpec{ "Input(x1+x2+x3+x4)", "FC(size:1)", "Target(ycts)", } // model is straight-forward with no hidden layers or dropouts. nn, e := NewNNModel(mod, mPipe, true, WithCostFn(RMS)) if e != nil { panic(e) } epochs := 150 ft := NewFit(nn, epochs, mPipe) e = ft.Do() if e != nil { panic(e) } sf := os.TempDir() + "/nnTest" e = nn.Save(sf) if e != nil { panic(e) } pred, e := PredictNN(sf, vPipe, false) if e != nil { panic(e) } fmt.Printf("out-of-sample correlation: %0.2f\n", stat.Correlation(pred.FitSlice(), pred.ObsSlice(), nil)) _ = os.Remove(sf + "P.nn") if e != nil { panic(e) } _ = os.Remove(sf + "S.nn") // Target: // out-of-sample correlation: 0.84
Output:
func PredictNNwFts ¶ added in v0.0.11
func PredictNNwFts(fileRoot string, pipe Pipeline, build bool, fts FTypes, opts ...NNOpts) (nn *NNModel, err error)
PredictNNwFts updates the input pipe to have the FTypes specified by fts. For instance, if one has normalized a continuous input, the normalization factor used in the NN must be the same as its build values.
func (*NNModel) OutputCols ¶ added in v0.0.9
OutputCols returns the number of columns in the output
type NNet ¶
type NNet interface { Inputs() G.Nodes // input nodes Features() G.Nodes // predictors Fitted() G.Result // model output Params() G.Nodes // model weights Obs() *G.Node // observed values CostFn() CostFunc // cost function of fitting Cost() *G.Node // cost node in graph Fwd() // forward pass G() *G.ExprGraph // return graph Save(fileRoot string) error // save model }
NNet interface for NN models
type Opts ¶
type Opts func(c Pipeline)
Opts function sets an option to a Pipeline
func WithBatchSize ¶
WithBatchSize sets the batch size for the pipeline
func WithCallBack ¶
WithCallBack sets a callback function.
Example ¶
// This example shows how to create a callback during the fitting phase (fit.Do). // The callback is called at the end of each epoch. The callback below loads a new dataset after // epoch 100. Verbose = false bSize := 100 // generate a Pipeline of type *ChData that reads test.csv in the data directory mPipe := chPipe(bSize, "test1.csv") // This callback function replaces the initial dataset with newData.csv after epoch 2500 cb := func(c Pipeline) { switch d := c.(type) { case *ChData: if d.Epoch(-1) == 100 { dataPath := os.Getenv("data") // path to data directory fileName := dataPath + "/testVal.csv" f, e := os.Open(fileName) if e != nil { panic(e) } rdrx := file.NewReader(fileName, ',', '\n', 0, 0, 1, 0, f, 0) if e := rdrx.Init("", chutils.MergeTree); e != nil { panic(e) } if e := rdrx.TableSpec().Impute(rdrx, 0, .99); e != nil { panic(e) } rows, _ := rdrx.CountLines() fmt.Println("New data at end of epoch ", d.Epoch(-1)) fmt.Println("Number of rows ", rows) WithReader(rdrx)(d) } } } WithCallBack(cb)(mPipe) // This model is OLS mod := ModSpec{ "Input(x1+x2+x3+x4)", "FC(size:1)", "Target(ycts)", } // model is straight-forward with no hidden layers or dropouts. nn, e := NewNNModel(mod, mPipe, true, WithCostFn(RMS)) if e != nil { panic(e) } epochs := 150 ft := NewFit(nn, epochs, mPipe) e = ft.Do() if e != nil { panic(e) } // Target: //New data at end of epoch 100 //Number of rows 1000
Output:
func WithCycle ¶
WithCycle sets the cycle bool. If false, the intent is for the Pipeline to generate a new data set for each epoch.
func WithFtypes ¶
WithFtypes sets the FTypes of the Pipeline. The feature is used to override the default levels.
func WithNormalized ¶
WithNormalized sets the features to be normalized.
func WithOneHot ¶
WithOneHot adds a one-hot field "name" based on field "from".
Example ¶
// This example shows a model that incorporates a feature (x4) as one-hot and an embedding Verbose = false bSize := 100 // generate a Pipeline of type *ChData that reads test.csv in the data directory pipe := chPipe(bSize, "test1.csv") // The feature x4 takes on values 0,1,2,...19. chPipe treats this a continuous feature. // Let's override that and re-initialize the pipeline. WithCats("x4")(pipe) WithOneHot("x4oh", "x4")(pipe) if e := pipe.Init(); e != nil { panic(e) } mod := ModSpec{ "Input(x1+x2+x3+x4oh)", "FC(size:2, activation:softmax)", "Target(yoh)", } // fmt.Println("x4 as one-hot") nn, e := NewNNModel(mod, pipe, true) if e != nil { panic(e) } fmt.Println(nn) fmt.Println("x4 as embedding") mod = ModSpec{ "Input(x1+x2+x3+E(x4oh,3))", "FC(size:2, activation:softmax)", "Target(yoh)", } nn, e = NewNNModel(mod, pipe, true) if e != nil { panic(e) } fmt.Println(nn) // Target: //x4 as one-hot // //Inputs //Field x1 // continuous // //Field x2 // continuous // //Field x3 // continuous // //Field x4oh // one-hot // derived from feature x4 // length 20 // //Target //Field yoh // one-hot // derived from feature y // length 2 // //Model Structure //Input(x1+x2+x3+x4oh) //FC(size:2, activation:softmax) //Target(yoh) // //Batch size: 100 //24 FC parameters //0 Embedding parameters // //x4 as embedding // //Inputs //Field x1 // continuous // //Field x2 // continuous // //Field x3 // continuous // //Field x4oh // embedding // derived from feature x4 // length 20 // embedding dimension of 3 // //Target //Field yoh // one-hot // derived from feature y // length 2 // //Model Structure //Input(x1+x2+x3+E(x4oh,3)) //FC(size:2, activation:softmax) //Target(yoh) // //Batch size: 100 //7 FC parameters //60 Embedding parameters
Output:
Example (Example2) ¶
// This example incorporates a drop out layer Verbose = false bSize := 100 // generate a Pipeline of type *ChData that reads test.csv in the data directory pipe := chPipe(bSize, "test1.csv") // generate model: target and features. Target yoh is one-hot with 2 levels mod := ModSpec{ "Input(x1+x2+x3+x4)", "FC(size:3, activation:relu)", "DropOut(.1)", "FC(size:2, activation:softmax)", "Target(yoh)", } nn, e := NewNNModel(mod, pipe, true, WithCostFn(CrossEntropy), WithName("Example With Dropouts")) if e != nil { panic(e) } fmt.Println(nn) // Target: //Example With Dropouts //Inputs //Field x1 // continuous // //Field x2 // continuous // //Field x3 // continuous // //Field x4 // continuous // //Target //Field yoh // one-hot // derived from feature y // length 2 // //Model Structure //Input(x1+x2+x3+x4) //FC(size:3, activation:relu) //DropOut(.1) //FC(size:2, activation:softmax) //Target(yoh) // //Cost function: CrossEntropy // //Batch size: 100 //19 FC parameters //0 Embedding parameters
Output:
type Pipeline ¶
type Pipeline interface { Init() error // initialize the pipeline Rows() int // # of observations in the pipeline (size of the epoch) Batch(inputs G.Nodes) bool // puts the next batch in the input nodes Epoch(setTo int) int // manage epoch count IsNormalized(field string) bool // true if feature is normalized IsCat(field string) bool // true if feature is one-hot encoded Cols(field string) int // # of columns in the feature IsCts(field string) bool // true if the feature is continuous GetFType(field string) *FType // Get FType for the feature GetFTypes() FTypes // Get Ftypes for pipeline BatchSize() int // batch size FieldList() []string // fields available GData() *GData // return underlying GData Get(field string) *GDatum // return data for field Slice(sl Slicer) (Pipeline, error) // slice the pipeline Shuffle() // shuffle data Describe(field string, topK int) string // describes a field }
The Pipeline interface specifies the methods required to be a data Pipeline. The Pipeline is the middleware between the data and the fitting routines.
type PlotDef ¶
type PlotDef struct { Show bool // Show - true = show graph in browser Title string // Title - plot title XTitle string // XTitle - x-axis title YTitle string // Ytitle - y-axis title STitle string // STitle - sub-title (under the x-axis) Legend bool // Legend - true = show legend Height float64 // Height - height of graph, in pixels Width float64 // Width - width of graph, in pixels FileName string // FileName - output file for graph (in html) }
PlotDef specifies Plotly Layout features I commonly use.
type Raw ¶
Raw holds a raw slice of type Kind
func NewRaw ¶
NewRaw creates a new raw slice from x. This assumes all elements of x are the same Kind
func NewRawCast ¶
type Slice ¶
type Slice struct {
// contains filtered or unexported fields
}
Slice implements generating Slicer functions for a feature. These are used to slice through the values of a discrete feature. For continuous features, it slices by quartile.
func NewSlice ¶
NewSlice makes a new Slice based on feat in Pipeline pipe. minCnt is the minimum # of obs a slice must have to be used. Restrict is a slice of values to restrict Iter to.
func (*Slice) Iter ¶
Iter iterates through the levels (ranges) of the feature. Returns false when done.
Example ¶
// An example of slicing through the data to generate diagnostics on subsets. // The code here will generate a decile plot for each of the 20 levels of x4. Verbose = false bSize := 100 // generate a Pipeline of type *ChData that reads test.csv in the data directory pipe := chPipe(bSize, "test1.csv") // The feature x4 takes on values 0,1,2,...19. chPipe treats this a continuous feature. // Let's override that and re-initialize the pipeline. WithCats("x4")(pipe) WithOneHot("x4oh", "x4")(pipe) if e := pipe.Init(); e != nil { panic(e) } mod := ModSpec{ "Input(x1+x2+x3+x4oh)", "FC(size:2, activation:softmax)", "Target(yoh)", } nn, e := NewNNModel(mod, pipe, true) if e != nil { panic(e) } WithCostFn(CrossEntropy)(nn) ft := NewFit(nn, 100, pipe) if e = ft.Do(); e != nil { panic(e) } sf := os.TempDir() + "/nnTest" e = nn.Save(sf) if e != nil { panic(e) } WithBatchSize(8500)(pipe) pred, e := PredictNN(sf, pipe, false) if e != nil { panic(e) } _ = os.Remove(sf + "P.nn") _ = os.Remove(sf + "S.nn") s, e := NewSlice("x4", 0, pipe, nil) if e != nil { panic(e) } for s.Iter() { slicer := s.MakeSlicer() xy, e := Coalesce(pred.ObsSlice(), pred.FitSlice(), 2, []int{1}, false, slicer) if e != nil { panic(e) } if e := Decile(xy, &PlotDef{ Title: "Decile: " + s.Title(), XTitle: "Score", YTitle: "Actual", STitle: "", Legend: false, Height: 1200, Width: 1200, Show: true, FileName: "", }); e != nil { panic(e) } } // Target:
Output:
func (*Slice) MakeSlicer ¶
MakeSlicer makes a Slicer function for the current value (discrete) or range (continuous) of the feature. Continuous features are sliced at the lower quartile, median and upper quartile, producing 4 slices.
type Slicer ¶
Slicer is an optional function that returns true if the row is to be used in calculations. This is used to subset the diagnostics to specific values.
type Summary ¶
type Summary struct { NRows int // size of the data DistrC *Desc // summary of continuous field DistrD Levels // summary of discrete field }
Summary has descriptive statistics of a field using its current data.
type VecData ¶
type VecData struct {
// contains filtered or unexported fields
}
func (*VecData) Describe ¶
Describe describes a field. If the field has role FRCat, the top k values (by frequency) are returned.
func (*VecData) IsNormalized ¶
IsNormalized returns true if the field is normalized.
func (*VecData) SaveFTypes ¶
SaveFTypes saves the FTypes for the Pipeline.
type XY ¶
XY struct holds (x,y) pairs as distinct slices
func Coalesce ¶
Coalesce reduces a softmax output to two categories
y observed multinomial values fit softmax fit to y nCat # of categories trg columns of y to be grouped into a single outcome. The complement is reduced to the alternate outcome. logodds if true, fit is in log odds space
An XY struct of the coalesced outcome (Y) & fitted values (X) is returned