Documentation ¶
Overview ¶
Package seafan is a set of tools for building DNN models. The build engine is gorgonia (https://pkg.go.dev/gorgonia.org/gorgonia).
Seafan features:
- A data pipeline based on chutils (https://github.com/invertedv/chutils) to access files and ClickHouse tables.
- Point-and-shoot specification of the data
- Simple specification of one-hot features
- A wrapper around gorgonia that meshes to the pipeline.
- Simple specification of models, including embeddings
- A fit method with optional early stopping and callbacks
- Saving and loading models
- Model diagnostics for categorical targets.
- KS plots
- Decile plots
- Utilities.
- Plotting wrapper for plotly (https://github.com/MetalBlueberry/go-plotly) for xy plots.
- Numeric struct for (x,y) data and plotting and descriptive statistics.
Index ¶
- Variables
- func AddFitted(pipeIn Pipeline, nnFile string, target []int, name string, fts FTypes, ...) error
- func Coalesce(vals []float64, nCat int, trg []int, binary, logodds bool, sl Slicer) ([]float64, error)
- func CrossEntropy(model *NNModel) (cost *G.Node)
- func Decile(xyIn *XY, plt *utilities.PlotDef) error
- func EvalSFunction(node *OpNode) error
- func Evaluate(curNode *OpNode, pipe Pipeline) error
- func Expr2Tree(curNode *OpNode) error
- func GetNode(ns G.Nodes, name string) *G.Node
- func KS(xy *XY, plt *utilities.PlotDef) (ks float64, notTarget *Desc, target *Desc, err error)
- func LeakyReluAct(n *G.Node, alpha float64) *G.Node
- func LinearAct(n *G.Node) *G.Node
- func Loop(loopVar string, start, end int, inner []*OpNode, assign []string, ...) error
- func Marginal(nnFile string, feat string, target []int, pipe Pipeline, pd *utilities.PlotDef, ...) error
- func PipeToCSV(pipe Pipeline, outFile string, sep, eol, quote rune) error
- func PipeToSQL(pipe Pipeline, table string, after int, conn *chutils.Connect) error
- func R2(y, yhat []float64) float64
- func RMS(model *NNModel) (cost *G.Node)
- func ReluAct(n *G.Node) *G.Node
- func SegPlot(pipe Pipeline, obs, fit, seg string, plt *utilities.PlotDef, ...) error
- func SigmoidAct(n *G.Node) *G.Node
- func SoftMaxAct(n *G.Node) *G.Node
- func SoftRMS(model *NNModel) (cost *G.Node)
- func Strip(s string) (left, inner string, err error)
- func UnNormalize(vals []float64, ft *FType) (unNorm []float64)
- func Unique(xs []any) []any
- func Wrapper(e error, text string) error
- type Activation
- type Args
- type ChData
- func (ch *ChData) AppendRows(gd *GData, fTypes FTypes) (pipeOut Pipeline, err error)
- func (ch *ChData) AppendRowsRaw(gd *GData) error
- func (ch *ChData) Batch(inputs G.Nodes) bool
- func (ch *ChData) BatchSize() int
- func (ch *ChData) Cols(field string) int
- func (ch *ChData) Describe(field string, topK int) string
- func (ch *ChData) Drop(field string) error
- func (ch *ChData) Epoch(setTo int) int
- func (ch *ChData) FieldCount() int
- func (ch *ChData) FieldList() []string
- func (ch *ChData) GData() *GData
- func (ch *ChData) Get(field string) *GDatum
- func (ch *ChData) GetFType(field string) *FType
- func (ch *ChData) GetFTypes() FTypes
- func (ch *ChData) GetKeepRaw() bool
- func (ch *ChData) Init() (err error)
- func (ch *ChData) IsCat(field string) bool
- func (ch *ChData) IsCts(field string) bool
- func (ch *ChData) IsNormalized(field string) bool
- func (ch *ChData) IsSorted() bool
- func (ch *ChData) Join(right Pipeline, onField string, joinType JoinType) (result Pipeline, err error)
- func (ch *ChData) Keep(fields []string) error
- func (ch *ChData) Name() string
- func (ch *ChData) ReInit(ftypes *FTypes) (pipeOut Pipeline, err error)
- func (ch *ChData) Row(take int) (newPipe Pipeline, err error)
- func (ch *ChData) Rows() int
- func (ch *ChData) SaveFTypes(fileName string) error
- func (ch *ChData) Shuffle()
- func (ch *ChData) Slice(sl Slicer) (Pipeline, error)
- func (ch *ChData) Sort(field string, ascending bool) error
- func (ch *ChData) SortField() string
- func (ch *ChData) String() string
- func (ch *ChData) Subset(rows []int) (newPipe Pipeline, err error)
- func (ch *ChData) Where(field string, equalTo []any) (newPipe Pipeline, err error)
- type CostFunc
- type DOLayer
- type Desc
- type FCLayer
- type FParam
- type FRole
- type FType
- type FTypes
- type Fit
- type FitOpts
- type FuncSpec
- type GData
- func (gd *GData) AddRaw(data [][]any, fields []string, fts FTypes, keepRaw bool) error
- func (gd *GData) AppendC(raw *Raw, name string, normalize bool, fp *FParam, keepRaw bool) error
- func (gd *GData) AppendD(raw *Raw, name string, fp *FParam, keepRaw bool) error
- func (gd *GData) AppendField(newData *Raw, name string, fRole FRole, keepRaw bool) error
- func (gd *GData) AppendRows(gdApp *GData, fTypes FTypes) (gdOut *GData, err error)
- func (gd *GData) AppendRowsRaw(gdApp *GData) error
- func (gd *GData) Back2Raw() (rawData []*Raw, nCol int, fields []string, err error)
- func (gd *GData) Close() error
- func (gd *GData) Copy() (gdOut *GData, err error)
- func (gd *GData) CountLines() (numLines int, err error)
- func (gd *GData) Drop(field string) error
- func (gd *GData) FieldCount() int
- func (gd *GData) FieldList() []string
- func (gd *GData) Get(name string) *GDatum
- func (gd *GData) GetData() []*GDatum
- func (gd *GData) GetFType(field string) *FType
- func (gd *GData) GetFTypes() FTypes
- func (gd *GData) GetRaw(field string) (*Raw, error)
- func (gd *GData) IsSorted() bool
- func (gd *GData) Join(right *GData, onField string, joinType JoinType) (result *GData, err error)
- func (gd *GData) Keep(fields []string) error
- func (gd *GData) Len() int
- func (gd *GData) Less(i, j int) bool
- func (gd *GData) MakeOneHot(from, name string) error
- func (gd *GData) ReInit(fTypes *FTypes) (gdOut *GData, err error)
- func (gd *GData) Read(nTarget int, validate bool) (data []chutils.Row, valid []chutils.Valid, err error)
- func (gd *GData) Reset() error
- func (gd *GData) Row(take int) (gdNew *GData, err error)
- func (gd *GData) Rows() int
- func (gd *GData) Seek(lineNo int) error
- func (gd *GData) Shuffle()
- func (gd *GData) Slice(sl Slicer) (*GData, error)
- func (gd *GData) Sort(field string, ascending bool) error
- func (gd *GData) SortField() string
- func (gd *GData) Subset(keepRows []int) (gdOut *GData, err error)
- func (gd *GData) Swap(i, j int)
- func (gd *GData) TableSpec() *chutils.TableDef
- func (gd *GData) UpdateFts(newFts FTypes) (*GData, error)
- func (gd *GData) Where(field string, equalTo []any) (gdOut *GData, err error)
- type GDatum
- type JoinType
- type Layer
- type Levels
- type ModSpec
- func (m ModSpec) Check() error
- func (m ModSpec) DropOut(loc int) *DOLayer
- func (m ModSpec) FC(loc int) *FCLayer
- func (m ModSpec) Inputs(p Pipeline) (FTypes, error)
- func (m ModSpec) LType(i int) (*Layer, error)
- func (m ModSpec) Save(fileName string) (err error)
- func (m ModSpec) Target(p Pipeline) (*FType, error)
- func (m ModSpec) TargetName() string
- type NNModel
- func LoadNN(fileRoot string, p Pipeline, build bool) (nn *NNModel, err error)
- func NewNNModel(modSpec ModSpec, pipe Pipeline, build bool, nnOpts ...NNOpts) (*NNModel, error)
- func PredictNN(fileRoot string, pipe Pipeline, build bool, opts ...NNOpts) (nn *NNModel, err error)
- func PredictNNwFts(fileRoot string, pipe Pipeline, build bool, fts FTypes, opts ...NNOpts) (nn *NNModel, err error)
- func (m *NNModel) Cost() *G.Node
- func (m *NNModel) CostFlt() float64
- func (m *NNModel) CostFn() CostFunc
- func (m *NNModel) Features() G.Nodes
- func (m *NNModel) FitSlice() []float64
- func (m *NNModel) Fitted() G.Result
- func (m *NNModel) Fwd()
- func (m *NNModel) G() *G.ExprGraph
- func (m *NNModel) InputFT() FTypes
- func (m *NNModel) Inputs() G.Nodes
- func (m *NNModel) ModSpec() ModSpec
- func (m *NNModel) Name() string
- func (m *NNModel) Obs() *G.Node
- func (m *NNModel) ObsSlice() []float64
- func (m *NNModel) Opts() []NNOpts
- func (m *NNModel) OutputCols() int
- func (m *NNModel) Params() G.Nodes
- func (m *NNModel) Save(fileRoot string) (err error)
- func (m *NNModel) String() string
- type NNOpts
- type OpNode
- type Opts
- func WithBatchSize(bsize int) Opts
- func WithCallBack(cb Opts) Opts
- func WithCats(names ...string) Opts
- func WithCycle(cycle bool) Opts
- func WithFtypes(fts FTypes) Opts
- func WithKeepRaw(keepRaw bool) Opts
- func WithNormalized(names ...string) Opts
- func WithOneHot(name, from string) Opts
- func WithReader(rdr any) Opts
- type Pipeline
- func AddToPipe(rootNode *OpNode, fieldName string, pipe Pipeline) (outPipe Pipeline, err error)
- func Append(pipe1, pipe2 Pipeline) (Pipeline, error)
- func CSVToPipe(csvFile string, fts FTypes, keepRaw bool) (pipe Pipeline, err error)
- func SQLToPipe(sql string, fts FTypes, keepRaw bool, conn *chutils.Connect) (pipe Pipeline, err error)
- func VecFromAny(data [][]any, fields []string, ftypes FTypes) (pipe Pipeline, err error)
- type Raw
- func (r *Raw) CumeAfter(aggType string) (*Raw, error)
- func (r *Raw) CumeBefore(aggType string) (*Raw, error)
- func (r *Raw) Exp() (*Raw, error)
- func (r *Raw) Index(indices *Raw) (*Raw, error)
- func (r *Raw) IsNumeric() bool
- func (r *Raw) Lag(missing any) (*Raw, error)
- func (r *Raw) Len() int
- func (r *Raw) Less(i, j int) bool
- func (r *Raw) Log() (*Raw, error)
- func (r *Raw) Max() (*Raw, error)
- func (r *Raw) Mean() (*Raw, error)
- func (r *Raw) Min() (*Raw, error)
- func (r *Raw) Pow(exponent *Raw) (*Raw, error)
- func (r *Raw) Product() (*Raw, error)
- func (r *Raw) Std() (*Raw, error)
- func (r *Raw) Sum() (*Raw, error)
- func (r *Raw) Swap(i, j int)
- type SeaError
- type Slice
- type Slicer
- type Summary
- type VecData
- func (vec *VecData) AppendRows(gd *GData, fTypes FTypes) (pipeOut Pipeline, err error)
- func (vec *VecData) AppendRowsRaw(gd *GData) error
- func (vec *VecData) Batch(inputs G.Nodes) bool
- func (vec *VecData) BatchSize() int
- func (vec *VecData) Cols(field string) int
- func (vec *VecData) Describe(field string, topK int) string
- func (vec *VecData) Drop(field string) error
- func (vec *VecData) Epoch(setTo int) int
- func (vec *VecData) FieldCount() int
- func (vec *VecData) FieldList() []string
- func (vec *VecData) GData() *GData
- func (vec *VecData) Get(field string) *GDatum
- func (vec *VecData) GetFType(field string) *FType
- func (vec *VecData) GetFTypes() FTypes
- func (vec *VecData) GetKeepRaw() bool
- func (vec *VecData) Init() error
- func (vec *VecData) IsCat(field string) bool
- func (vec *VecData) IsCts(field string) bool
- func (vec *VecData) IsNormalized(field string) bool
- func (vec *VecData) IsSorted() bool
- func (vec *VecData) Join(right Pipeline, onField string, joinType JoinType) (result Pipeline, err error)
- func (vec *VecData) Keep(fields []string) error
- func (vec *VecData) Name() string
- func (vec *VecData) ReInit(ftypes *FTypes) (pipeOut Pipeline, err error)
- func (vec *VecData) Row(take int) (newPipe Pipeline, err error)
- func (vec *VecData) Rows() int
- func (vec *VecData) SaveFTypes(fileName string) error
- func (vec *VecData) Shuffle()
- func (vec *VecData) Slice(sl Slicer) (Pipeline, error)
- func (vec *VecData) Sort(field string, ascending bool) error
- func (vec *VecData) SortField() string
- func (vec *VecData) String() string
- func (vec *VecData) Subset(rows []int) (newPipe Pipeline, err error)
- func (vec *VecData) Where(field string, equalTo []any) (newPipe Pipeline, err error)
- type XY
Examples ¶
Constants ¶
This section is empty.
Variables ¶
var ( // FunctionsStr lists the functions that parser supports, the number and types of arguments, type of return //go:embed strings/functions.txt FunctionsStr string // Functions is a slice that describes all supported functions/operations Functions []FuncSpec Height = 1200.0 Width = 1200.0 )
var Browser = "firefox"
Browser is the browser to use for plotting.
var Verbose = true
Verbose controls amount of printing.
Functions ¶
func AddFitted ¶
func AddFitted(pipeIn Pipeline, nnFile string, target []int, name string, fts FTypes, logodds bool, obsFit *FType) error
AddFitted adds fitted values to a Pipeline. The features can be re-normalized/re-mapped to align pipeIn with the model build. Parameters: pipeIn -- input Pipeline to run the model on; nnFile -- root directory of NNModel; target -- target columns of the model output to coalesce; name -- name of fitted value in Pipeline; fts -- optional FTypes to use for normalizing pipeIn.
func Coalesce ¶
func Coalesce(vals []float64, nCat int, trg []int, binary, logodds bool, sl Slicer) ([]float64, error)
Coalesce combines columns of either a one-hot feature or a softmax output. In the case of a feature, it returns 1 if any of the target columns is 1. In the case of a softmax output, it sums the entries.
func Decile ¶
Decile generates a decile plot based on xy
XY values to base the plot on. plt PlotDef plot options. If plt is nil an error is generated.
The deciles are created based on the values of xy.X
func EvalSFunction ¶ added in v0.1.1
EvalSFunction evaluates a summary function. A summary function returns a single value.
func Evaluate ¶ added in v0.1.0
Evaluate evaluates an expression parsed by Expr2Tree. The user calls Evaluate with the top node as returned by Expr2Tree. To add a field to a pipeline:
- Create the *OpNode tree to evaluate the expression using Expr2Tree
- Populate the values from a Pipeline using Evaluate.
- Add the values to the Pipeline using AddToPipe
Note, you can access the values after Evaluate without adding the field to the Pipeline from the *Raw item of the root node.
Example ¶
This example shows how to print a result
Verbose = false // builds a Pipline with two fields: // c = 1,2,3,4 // D = 5,-5,3,6 pipe := buildPipe([]string{"1,2,3,4", "'a', 'b', 'c', 'd'"}, []string{"f", "s"}) field := &OpNode{Expression: "print(c, 0)"} if e := Expr2Tree(field); e != nil { panic(e) } if e := Evaluate(field, pipe); e != nil { panic(e) }
Output: c 0: 1 1: 2 2: 3 3: 4
Example (DateAdd) ¶
Simple date arithmetic is possible. The function dateAdd(d,m) adds m months to d. The data is: row, newField1, newField2, newField3, date 0,row0,.1,.2, 3/1/2023 2,row2,2.1,3.2, 4/1/2023 3,row3,3.1,4.2, 5/1/2023 4,row4,4.1,5.2, 6/1/2023 1,row1,1.1,2.2, 7/1/2023 100,row100,4100.0,5200.0, 8/1/2020
Verbose = false var ( outPipe Pipeline err error ) data := os.Getenv("data") pipe, e := CSVToPipe(data+"/pipeTest2.csv", nil, false) if e != nil { panic(e) } root := &OpNode{Expression: "dateAdd(date,row)"} if err = Expr2Tree(root); err != nil { panic(err) } if err = Evaluate(root, pipe); err != nil { panic(err) } if outPipe, err = AddToPipe(root, "nextMonth", pipe); err != nil { panic(err) } fmt.Println("date + row months") raw, e := outPipe.GData().GetRaw("nextMonth") if e != nil { panic(e) } fmt.Println(raw.Data)
Output: date + row months [2023-03-01 00:00:00 +0000 UTC 2023-06-01 00:00:00 +0000 UTC 2023-08-01 00:00:00 +0000 UTC 2023-10-01 00:00:00 +0000 UTC 2023-08-01 00:00:00 +0000 UTC 2028-12-01 00:00:00 +0000 UTC]
Example (If) ¶
var ( outPipe Pipeline err error ) Verbose = false data := os.Getenv("data") pipe, e := CSVToPipe(data+"/pipeTest2.csv", nil, false) if e != nil { panic(e) } root := &OpNode{Expression: "if(date=='3/1/2023',1,0)"} if err = Expr2Tree(root); err != nil { panic(err) } if err = Evaluate(root, pipe); err != nil { panic(err) } if outPipe, err = AddToPipe(root, "march2023", pipe); err != nil { panic(err) } fmt.Println(pipe.Get("march2023").Data.([]float64)) root = &OpNode{Expression: "if(date>'3/1/2023',1,0)"} if err = Expr2Tree(root); err != nil { panic(err) } if err = Evaluate(root, pipe); err != nil { panic(err) } if outPipe, err = AddToPipe(root, "afterMarch2023", pipe); err != nil { panic(err) } fmt.Println(outPipe.Get("afterMarch2023").Data.([]float64))
Output: [1 0 0 0 0 0] [0 1 1 1 1 0]
func Expr2Tree ¶ added in v0.1.0
Expr2Tree builds the OpNode tree that is a binary tree representation of an expression. The process to add a field to a Pipeline is:
- Create the *OpNode tree using Expr2Tree to evaluate the expression
- Populate the values from a Pipeline using Evaluate.
- Add the values to the Pipeline using AddToPipe.
Note, you can access the values after Evaluate without adding the field to the Pipeline from the Raw field of the root node.
The expression can include:
- arithmetic operators: +, -, *, /
- exponentiation: ^
- functions
- logicals: &&, ||. These evaluate to 0 or 1.
- if statements: if(condition, value if true, value if false). The true value is applied if the condition evaluates to a positive value.
- parentheses
func KS ¶
KS finds the KS of a softmax model that is reduced to a binary outcome.
xy XY struct where x is fitted value and y is the binary observed value plt PlotDef plot options. If plt is nil, no plot is produced.
The ks statistic is returned as are Desc descriptions of the model for the two groups. Returns
ks KS statistic notTarget Desc struct of fitted values of the non-target outcomes target Desc struct of fitted values of target outcomes
Target: html plot file and/or plot in browser.
func LeakyReluAct ¶
LeakyReluAct is leaky relu activation
func Loop ¶ added in v0.1.2
Loop enables looping in parse. The ops in inner are run for each iteration.
- inner - is a slice of *OpNode expressions to run in the loop and then assign to "assign" in the pipeline
- loopVar - the name of the loop variable. This may be used in the "inner" expressions. It is not added to the pipeline.
- loopVar takes on values from start to end.
Example ¶
In this example, we calculate four expressions and return them to the pipeline. The field c is added to itself each iteration. The field r stores the loop field. On return, it will have the last value of the loop.
The fields are evaluated in order, starting with the 0 element of inner.
Verbose = false // builds a Pipline with two fields: // c = 1,2,3,4 // D = 5,-5,3,6 pipe := buildPipe([]string{"1,2,3,4", "5,-5,3,6"}, []string{"f", "f"}) // we'll add two fields to the pipeline: the sum=c+d and max=max(c,d) // start by parsing the expressions. field1, result1 := &OpNode{Expression: "c+c"}, "c" field2, result2 := &OpNode{Expression: "indx"}, "r" // indx will be the name of the looping field. field3, result3 := &OpNode{Expression: "D*c"}, "s" field4, result4 := &OpNode{Expression: "s-1"}, "t" inner := []*OpNode{field1, field2, field3, field4} assign := []string{result1, result2, result3, result4} for ind := 0; ind < len(assign); ind++ { if e := Expr2Tree(inner[ind]); e != nil { panic(e) } } if e := Loop("indx", 1, 3, inner, assign, pipe); e != nil { panic(e) } for ind := 0; ind < len(assign); ind++ { fmt.Println(assign[ind]) fmt.Println(pipe.Get(assign[ind]).Data.([]float64)) }
Output: c [4 8 12 16] r [2 2 2 2] s [20 -40 36 96] t [19 -41 35 95]
func Marginal ¶
func Marginal(nnFile string, feat string, target []int, pipe Pipeline, pd *utilities.PlotDef, obsFtype *FType) error
Marginal produces a set of plots to aid in understanding the effect of a feature. The plot takes the model output and creates six segments based on the quantiles of the model output: (<.1, .1-.25, .25-.5, .5-.75, .75-.9, .9-1).
For each segment, the feature being analyzed varies across its range within the quartile (continuous) or its values (discrete). The bottom row shows the distribution of the feature within the quartile range.
func SegPlot ¶ added in v0.0.29
func SegPlot(pipe Pipeline, obs, fit, seg string, plt *utilities.PlotDef, minVal, maxVal *float64) error
SegPlot generates a decile plot of the fields obs and fit in pipe. The segments are based on the values of the field seg. If seg is continuous, the segments are based on quantiles: 0-.1, .1-.25, .25-.5, .5-.75, .75-.9, .9-1
obs observed field (y-axis) name fit fitted field (x-axis) name seg segmenting field name plt PlotDef plot options. If plt is nil an error is generated.
func SoftMaxAct ¶
SoftMaxAct implements the softmax activation function
func Strip ¶
Strip is a utility that takes a string of the form "Func(args)" and returns "Func" and "args"
func UnNormalize ¶ added in v0.0.29
UnNormalize un-normalizes a slice, if need be
Types ¶
type Activation ¶
type Activation int
Activation types
const ( Linear Activation = 0 + iota Relu LeakyRelu Sigmoid SoftMax )
func StrAct ¶
func StrAct(s string) (*Activation, float64)
StrAct takes a string and returns corresponding Activation and any parameter. Nil if fails.
func (Activation) String ¶
func (i Activation) String() string
type Args ¶
Args map holds layer arguments in key/val style
type ChData ¶
type ChData struct {
// contains filtered or unexported fields
}
ChData provides a Pipeline interface into text files (delimited, fixed length) and ClickHouse.
func (*ChData) AppendRows ¶ added in v0.2.8
AppendRows appends rows to the existing GData and then re-initializes each GDatum, using the fTypes, if provided.
func (*ChData) AppendRowsRaw ¶ added in v0.2.8
AppendRowsRaw simply appends rows, in place, to the existing GData. Only the *Raw data is updated. The .Data field is set to nil.
func (*ChData) Batch ¶
Batch loads a batch into Inputs. It returns false if the epoch is done. If cycle is true, it will start at the beginning on the next call. If cycle is false, it will call Init() at the next call to Batch().
Example ¶
dataPath := os.Getenv("data") // path to data directory fileName := dataPath + "/test1.csv" f, e := os.Open(fileName) if e != nil { panic(e) } // set up chutils file reader rdr := file.NewReader(fileName, ',', '\n', 0, 0, 1, 0, f, 0) e = rdr.Init("", chutils.MergeTree) if e != nil { panic(e) } // determine data types e = rdr.TableSpec().Impute(rdr, 0, .99) if e != nil { panic(e) } bSize := 100 ch := NewChData("Test ch Pipeline", WithBatchSize(bSize), WithReader(rdr), WithNormalized("x1")) // create a graph & node to illustrate Batch() g := G.NewGraph() node := G.NewTensor(g, G.Float64, 2, G.WithName("x1"), G.WithShape(bSize, 1), G.WithInit(G.Zeroes())) var sumX = 0.0 n := 0 // run through batchs and verify counts and mean of x1 is zero for ch.Batch(G.Nodes{node}) { n += bSize x := node.Value().Data().([]float64) for _, xv := range x { sumX += xv } } mean := sumX / float64(n) fmt.Printf("mean of x1: %0.2f", math.Abs(mean))
Output: rows read: 8500 mean of x1: 0.00
Example (Example2) ¶
// We can normalize fields by values we supply rather than the values in the epoch. dataPath := os.Getenv("data") // path to data directory fileName := dataPath + "/test1.csv" f, e := os.Open(fileName) if e != nil { panic(e) } // set up chutils file reader rdr := file.NewReader(fileName, ',', '\n', 0, 0, 1, 0, f, 0) e = rdr.Init("", chutils.MergeTree) if e != nil { panic(e) } // determine data types e = rdr.TableSpec().Impute(rdr, 0, .99) if e != nil { panic(e) } bSize := 100 // Let's normalize x1 with location=41 and scale=1 ft := &FType{ Name: "x1", Role: 0, Cats: 0, EmbCols: 0, Normalized: true, From: "", FP: &FParam{Location: 40, Scale: 1}, } ch := NewChData("Test ch Pipeline", WithBatchSize(bSize), WithReader(rdr)) WithFtypes(FTypes{ft})(ch) // create a graph & node to illustrate Batch() g := G.NewGraph() node := G.NewTensor(g, G.Float64, 2, G.WithName("x1"), G.WithShape(bSize, 1), G.WithInit(G.Zeroes())) sumX := 0.0 n := 0 // run through batchs and verify counts and mean of x1 is zero for ch.Batch(G.Nodes{node}) { n += bSize x := node.Value().Data().([]float64) for _, xv := range x { sumX += xv } } mean := sumX / float64(n) fmt.Printf("mean of x1: %0.2f", math.Abs(mean))
Output: rows read: 8500 mean of x1: 39.50
func (*ChData) Describe ¶
Describe describes a field. If the field has role FRCat, the top k values (by frequency) are returned.
func (*ChData) FieldCount ¶ added in v0.2.7
FieldCount returns the number of fields in the pipeline
func (*ChData) GetKeepRaw ¶ added in v0.2.0
GetKeepRaw returns true if *Raw data is retained
func (*ChData) Init ¶
Init initializes the Pipeline.
Example ¶
dataPath := os.Getenv("data") // path to data directory fileName := dataPath + "/test1.csv" f, e := os.Open(fileName) if e != nil { panic(e) } // set up chutils file reader rdr := file.NewReader(fileName, ',', '\n', 0, 0, 1, 0, f, 0) e = rdr.Init("", chutils.MergeTree) if e != nil { panic(e) } // determine data types e = rdr.TableSpec().Impute(rdr, 0, .99) if e != nil { panic(e) } bSize := 100 ch := NewChData("Test ch Pipeline", WithBatchSize(bSize), WithReader(rdr), WithCycle(true), WithCats("y", "y1", "y2", "x4"), WithOneHot("yoh", "y"), WithOneHot("y1oh", "y1"), WithOneHot("x4oh", "x4"), WithNormalized("x1", "x2", "x3"), WithOneHot("y2oh", "y2")) // initialize pipeline e = ch.Init() if e != nil { panic(e) }
Output: rows read: 8500
func (*ChData) IsNormalized ¶
IsNormalized returns true if the field is normalized.
func (*ChData) ReInit ¶ added in v0.2.8
ReInit re-initializes the Data field from Raw for each GDatum. If ftypes is not nil, these values are used, otherwise the FParam values are re-derived from the data. A new pipeline is returned.
func (*ChData) SaveFTypes ¶
SaveFTypes saves the FTypes for the Pipeline.
Example ¶
// Field Types (FTypes) can be saved once they're created. This preserves key information like // - The field role // - Location and Scale used in normalization // - Mapping of discrete fields // - Construction of one-hot fields dataPath := os.Getenv("data") // path to data directory fileName := dataPath + "/test1.csv" f, e := os.Open(fileName) if e != nil { panic(e) } // set up chutils file reader rdr := file.NewReader(fileName, ',', '\n', 0, 0, 1, 0, f, 0) e = rdr.Init("", chutils.MergeTree) if e != nil { panic(e) } // determine data types e = rdr.TableSpec().Impute(rdr, 0, .99) if e != nil { panic(e) } bSize := 100 ch := NewChData("Test ch Pipeline", WithBatchSize(bSize), WithReader(rdr), WithCycle(true), WithCats("y", "y1", "y2", "x4"), WithOneHot("yoh", "y"), WithOneHot("y1oh", "y1"), WithOneHot("x4oh", "x4"), WithNormalized("x1", "x2", "x3"), WithOneHot("y2oh", "y2")) // initialize pipeline e = ch.Init() if e != nil { panic(e) } outFile := os.TempDir() + "/seafan.json" if e = ch.SaveFTypes(outFile); e != nil { panic(e) } saveFTypes, e := LoadFTypes(outFile) if e != nil { panic(e) } ch1 := NewChData("Saved FTypes", WithReader(rdr), WithBatchSize(bSize), WithFtypes(saveFTypes)) if e := ch1.Init(); e != nil { panic(e) } fmt.Printf("Role of field y1oh: %s", ch.GetFType("y1oh").Role)
Output: rows read: 8500 rows read: 8500 Role of field y1oh: FROneHot
type DOLayer ¶
type DOLayer struct { // position int // insert dropout after layer AfterLayer DropProb float64 // dropout probability }
DOLayer specifies a dropout layer. It occurs in the graph after dense layer AfterLayer (the input layer is layer 0).
func DropOutParse ¶
DropOutParse parses the arguments to a drop out layer
type Desc ¶
type Desc struct { Name string // Name is the name of feature we are describing N int // N is the number of observations U []float64 // U is the slice of locations at which to find the quantile Q []float64 // Q is the slice of empirical quantiles Mean float64 // Mean is the average of the data Std float64 // standard deviation }
Desc contains descriptive information of a float64 slice
func Assess ¶
func Assess(xy *XY, cutoff float64) (n int, precision, recall, accuracy float64, obs, fit *Desc, err error)
Assess returns a selection of statistics of the fit
func NewDesc ¶
NewDesc creates a pointer to a new Desc struct instance with error checking.
u is a slice of values at which to find quantiles. If nil, a standard set is used. name is the name of the feature (for printing).
type FCLayer ¶
type FCLayer struct { Size int Bias bool Act Activation ActParm float64 }
FCLayer has details of a fully connected layer
type FParam ¶
type FParam struct { Location float64 `json:"location"` // location parameter for *Cts Scale float64 `json:"scale"` // scale parameter for *Cts Default any `json:"default"` // default level for *Dscrt Lvl Levels `json:"lvl"` // map of values to int32 category for *Dscrt }
FParam -- field parameters -- is summary data about a field. These values may not be derived from the current data but are applied to the current data.
type FType ¶
type FType struct { Name string Role FRole Cats int EmbCols int Normalized bool From string FP *FParam }
FType represents a single field. It holds key information about the feature: its role, dimensions, summary info.
type FTypes ¶
type FTypes []*FType
func LoadFTypes ¶
LoadFTypes loads a file created by the FTypes Save method
func (FTypes) DropFields ¶
DropFields will drop fields from the FTypes
type Fit ¶
type Fit struct {
// contains filtered or unexported fields
}
Fit struct for fitting a NNModel
func (*Fit) BestEpoch ¶
BestEpoch returns the epoch of the best cost (validation or in-sample--whichever is specified)
func (*Fit) Do ¶
Do is the fitting loop. Upon completion ft.nn will have the best model.
Example ¶
Verbose = false bSize := 100 // generate a Pipeline of type *ChData that reads test.csv in the data directory pipe := chPipe(bSize, "test1.csv") // generate model: target and features. Target yoh is one-hot with 2 levels mod := ModSpec{ "Input(x1+x2+x3+x4)", "FC(size:3, activation:relu)", "DropOut(.1)", "FC(size:2, activation:softmax)", "Target(yoh)", } // model is straight-forward with no hidden layers or dropouts. nn, e := NewNNModel(mod, pipe, true, WithCostFn(CrossEntropy)) if e != nil { panic(e) } epochs := 150 ft := NewFit(nn, epochs, pipe) e = ft.Do() if e != nil { panic(e) } // Plot the in-sample cost in a browser (default: firefox) e = ft.InCosts().Plot(&utilities.PlotDef{Title: "In-Sample Cost Curve", Height: 1200, Width: 1200, Show: true, XTitle: "epoch", YTitle: "Cost"}, true) if e != nil { panic(e) }
Output:
Example (Example2) ¶
// This example demonstrates how to use a validation sample for early stopping Verbose = false bSize := 100 // generate a Pipeline of type *ChData that reads test.csv in the data directory mPipe := chPipe(bSize, "test1.csv") vPipe := chPipe(1000, "testVal.csv") // generate model: target and features. Target yoh is one-hot with 2 levels mod := ModSpec{ "Input(x1+x2+x3+x4)", "FC(size:3, activation:relu)", "DropOut(.1)", "FC(size:2, activation:softmax)", "Target(yoh)", } nn, e := NewNNModel(mod, mPipe, true, WithCostFn(CrossEntropy)) if e != nil { panic(e) } epochs := 150 ft := NewFit(nn, epochs, mPipe) WithValidation(vPipe, 10)(ft) e = ft.Do() if e != nil { panic(e) } // Plot the in-sample cost in a browser (default: firefox) e = ft.InCosts().Plot(&utilities.PlotDef{Title: "In-Sample Cost Curve", Height: 1200, Width: 1200, Show: true, XTitle: "epoch", YTitle: "Cost"}, true) if e != nil { panic(e) } e = ft.OutCosts().Plot(&utilities.PlotDef{Title: "Validation Sample Cost Curve", Height: 1200, Width: 1200, Show: true, XTitle: "epoch", YTitle: "Cost"}, true) if e != nil { panic(e) }
Output:
type FitOpts ¶
type FitOpts func(*Fit)
FitOpts functions add options
func WithLearnRate ¶
WithLearnRate sets a learning rate function that declines linearly across the epochs.
func WithOutFile ¶
WithOutFile specifies the file root name to save the best model.
func WithShuffle ¶
WithShuffle shuffles after interval epochs. Default is 0 (don't shuffle ever).
func WithValidation ¶
WithValidation adds a validation Pipeline for early stopping. The fit is stopped when the validation cost does not improve for wait epochs.
type FuncSpec ¶ added in v0.2.0
type FuncSpec struct { Name string // The name of the function/operation. Return reflect.Kind // The type of the return. This will either be float64 or any. Args []reflect.Kind // The types of the inputs to the function. Level rune // 'S' if the function is summary-level (1 element) or 'R' if it is row-level. }
FuncSpec stores the details about a function call.
type GData ¶
type GData struct {
// contains filtered or unexported fields
}
func (*GData) AddRaw ¶ added in v0.2.9
AddRaw adds a number of fields in []any format to *GData. The fts are only used to determine the Role.
func (*GData) AppendField ¶ added in v0.0.29
AppendField adds a field to gd
func (*GData) AppendRows ¶ added in v0.2.8
AppendRows appends rows to the existing GData and then re-initializes each GDatum, using the fTypes, if provided.
func (*GData) AppendRowsRaw ¶ added in v0.2.8
AppendRowsRaw simply appends rows, in place, to the existing GData. Only the *Raw data is updated. The .Data field is set to nil.
func (*GData) CountLines ¶ added in v0.0.27
func (*GData) FieldCount ¶
FieldCount returns the number of fields in GData
func (*GData) GetFTypes ¶ added in v0.2.9
GetFTypes returns a slice of *FType corresponding to GData.data
func (*GData) Join ¶ added in v0.2.9
Join joins two *GData on onField. Both *GData are sorted by onField, though the result may not be in sort order for Outer and Right joins. If a field value is missing, the FType.FParam.Default value is filled in. If that value is nil, the following are used:
- int,float : 0
- string ""
- time.Time: 1/1/1970
The field being joined on must have the same name in both *GData. Fields in the left *GData have priority -- if there are duplicate fields, the fields in "right" are omitted.
The resulting *GData has only *Raw fields populated. To populate the .data fields, use ReInit. FROneHot and FREmbed fields are left behind -- they'll need to be recreated after the join.
Example ¶
This example shows how to join two *GData structs.
// Build the first GData gdLeft := NewGData() field0 := []any{0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0} if e := gdLeft.AppendC(NewRaw(field0, nil), "field0", false, nil, true); e != nil { panic(e) } field1 := []any{"r", "s", "b", "l", "c", "s", "a"} if e := gdLeft.AppendD(NewRaw(field1, nil), "field1", nil, true); e != nil { panic(e) } field2 := []any{"A", "B", "C", "D", "E", "F", "G"} // value to use for field2 if gdLeft doesn't contribute to an output row fp := &FParam{Default: "XX"} if e := gdLeft.AppendD(NewRaw(field2, nil), "field2", fp, true); e != nil { panic(e) } // Build the second GData gdRight := NewGData() field3 := []any{100.0, 200.0, 300.0, 400.0, 500.0} if e := gdRight.AppendC(NewRaw(field3, nil), "field3", false, nil, true); e != nil { panic(e) } field1 = []any{"a", "b", "c", "k", "a"} if e := gdRight.AppendD(NewRaw(field1, nil), "field1", nil, true); e != nil { panic(e) } // do an outer join on field1 gdJoin, err := gdLeft.Join(gdRight, "field1", Outer) if err != nil { panic(err) } for _, fld := range gdJoin.FieldList() { x, err := gdJoin.GetRaw(fld) if err != nil { panic(err) } fmt.Println(fld) fmt.Println(x.Data) }
Output: field0 [6 6 2 4 3 0 1 5 0] field2 [G G C E D A B F XX] field3 [100 500 200 300 0 0 0 0 400] field1 [a a b c l r s s k]
func (*GData) MakeOneHot ¶
MakeOneHot creates & appends a one hot feature from a discrete feature
func (*GData) ReInit ¶ added in v0.2.8
ReInit re-initializes the Data field from Raw for each GDatum. If ftypes is not nil, these values are used, otherwise the FParam values are re-derived from the data.
func (*GData) Read ¶ added in v0.0.27
func (gd *GData) Read(nTarget int, validate bool) (data []chutils.Row, valid []chutils.Valid, err error)
Read reads row(s) in the format of chutils. Note: valids are all chutils.Valid. Invoking Read for the first time causes it to recreate the raw data of existing fields -- so the memory requirement will go up.
func (*GData) Sort ¶
Sort sorts the GData on field. Calling Sort.Sort directly will cause a panic. Sorting a OneHot or Embedded field sorts on the underlying Categorical field
type GDatum ¶
type GDatum struct { FT *FType // FT stores the details of the field: it's role, # categories, mappings Summary Summary // Summary of the Data (e.g. distribution) Data any // Data. This will be either []float64 (FRCts, FROneHot, FREmbed) or []int32 (FRCat) Raw *Raw }
type JoinType ¶ added in v0.2.9
type JoinType int
JoinType is the method to use in joining two GData structs
type Levels ¶
Levels is a map from underlying values of a discrete tensor to int32 values
func ByPtr ¶
ByPtr returns a mapping of values of data to []int32 for modeling. The values of data are sorted, so the smallest will have a mapped value of 0.
type ModSpec ¶
type ModSpec []string
ModSpec holds layers--each slice element is a layer
func LoadModSpec ¶
LoadModSpec loads a ModSpec from file
func (ModSpec) DropOut ¶
DropOut returns the *DoLayer for layer i, if it is of type DropOut. Returns nil o.w.
func (ModSpec) TargetName ¶ added in v0.0.29
type NNModel ¶
type NNModel struct {
// contains filtered or unexported fields
}
NNModel structure
func LoadNN ¶
LoadNN restores a previously saved NNModel. fileRoot is the root name of the save file. p is the Pipeline with the field specs. if build is true, DropOut layers are included.
func NewNNModel ¶
NewNNModel creates a new NN model. Specs for fields in modSpec are pulled from pipe. if build is true, DropOut layers are included.
func PredictNN ¶
PredictNN reads in a NNModel from a file and populates it with a batch from p. Methods such as FitSlice and ObsSlice are immediately available.
Example ¶
// This example demonstrates fitting a regression model and predicting on new data Verbose = false bSize := 100 // generate a Pipeline of type *ChData that reads test.csv in the data directory mPipe := chPipe(bSize, "test1.csv") vPipe := chPipe(1000, "testVal.csv") // This model is OLS mod := ModSpec{ "Input(x1+x2+x3+x4)", "FC(size:1)", "Target(ycts)", } // model is straight-forward with no hidden layers or dropouts. nn, e := NewNNModel(mod, mPipe, true, WithCostFn(RMS)) if e != nil { panic(e) } epochs := 150 ft := NewFit(nn, epochs, mPipe) e = ft.Do() if e != nil { panic(e) } sf := os.TempDir() + "/nnTest" e = nn.Save(sf) if e != nil { panic(e) } pred, e := PredictNN(sf, vPipe, false) if e != nil { panic(e) } fmt.Printf("out-of-sample correlation: %0.2f\n", stat.Correlation(pred.FitSlice(), pred.ObsSlice(), nil)) _ = os.Remove(sf + "P.nn") if e != nil { panic(e) } _ = os.Remove(sf + "S.nn")
Output: out-of-sample correlation: 0.84
func PredictNNwFts ¶ added in v0.0.11
func PredictNNwFts(fileRoot string, pipe Pipeline, build bool, fts FTypes, opts ...NNOpts) (nn *NNModel, err error)
PredictNNwFts creates a new Pipeline that updates the input pipe to have the FTypes specified by fts. For instance, if one has normalized a continuous input, the normalization factor used in the NN must be the same as its build values. One should save the FTypes from the model build and pass them here.
func (*NNModel) OutputCols ¶ added in v0.0.9
OutputCols returns the number of columns in the output
type OpNode ¶ added in v0.1.0
type OpNode struct { Expression string // expression this node implements Raw *Raw // node Value Func *FuncSpec // details of the function required to evaulate this node. Role FRole // FRole to use when adding this node to a Pipeline Neg bool // negate result when populating Value Inputs []*OpNode // Inputs to node calculation // contains filtered or unexported fields }
OpNode is a single node of an expression. The input expression is successively broken into simpler expressions. Each leaf is devoid of expressions--they are only values. Hence, leaves have no Inputs.
operations: If the expression at a node is an operation, it will be broken into two subexpressions. The subexpressions are determined by scanning from the left using the order of precedence (+,-,*,/), respecting parentheses. The two subexpressions create two new nodes in Inputs.
Comparison operations with fields of type FRCat are permitted if the underlying data is type string or date. Strings and dates are enclosed in a single quote ('). Date formats supported are: CCYYMMDD and MM/DD/CCYY.
Functions: If the expression is a function, each argument is assigned to an Input (in order). Functions have at least one input (argument). Two types of functions are supported: those that operate at the row level and those that operate at the summary level. A row-level function will create a slice that has as many elements as the Pipeline. A summary-level function, such as "mean", will have a single element.
Available row-level functions are:
- exp(<expr>)
- log(<expr>)
- lag(<expr>,<missing>), where <missing> is used for the first element.
- abs(<expr>) absolute value
- if(<test>, <true>, <false>), where the value <true> is used if <test> is greater than 0 and <false> o.w.
- row(<expr>) row number in pipeline. Row starts as 0 and is continuous.
- countAfter(<expr>), countBefore(<expr>) is the number of rows after (before) the current row.
- cumeAfter(<expr>), cumeBefore(<expr>,<missing>) is the cumulative sum of <expr> after (before) the current row (included)
- prodAfter(<expr>), prodBefore(<expr>,<missing>) is the cumulative product of <expr> after (before) the current row (included) and <missing> is used for the last (first) element.
- index(<expr>,<index>) returns <expr> in the order of <index>
- cat(<expr>) converts <expr> to a categorical field. Only applicable to continuous fields.
- toDate(<expr>) converts a string field to a date
- toString(<expr>) converts <expr> to string
- toFloatSP(<expr>) converts <expr> to float32
- toFloatDP(<expr>) converts <expr> to float64
- toInt(<expr>) converts <expr> to int. Same as cat().
- dateAdd(<date>,<months>) adds <months> to the date, <date>
- toLastDayOfMonth(<date>) moves the date to the last day of the month
- toFirstDayOfMonth(<date>) moves the date to the first day of the month
- year(<date>) returns the year
- month(<date>) returns the month (1-12)
- day(<date>) returns the day of the month (1-lastDayOfMonth)
- dateDiff(<date1>,<date2>,unit) returns date1-date2. Units can be 'hour', 'day', 'month' or 'year'.
- substr(<string>,<start>,<length>) substring
- strPos(<string>,<target>) first position of <target> in <string>. -1 if does not occur.
- strCount(<string>,<target>) number of times <target> occurs in <string>
- strLen(<string>) length of string
- trunc(<expr>) truncate to int
The values in <...> can be any expression. The functions prodAfter, prodBefore, cumeAfter, cumeBefore, countAfter, countBefore do NOT include the current row.
Available summary-level functions are:
- mean(<expr>)
- count(<expr>)
- sum(<expr>)
- max(<expr>)
- min(<expr>)
- sse(<y>,<yhat>) returns the sum of squared error of y-yhat
- mad(<y>,<yhat>) returns the sum of the absolute value of y-yhat
- r2(<y>,<yhat>) returns the r-square of estimating y with yhat
- npv(<discount rate>, <cash flows>). Find the NPV of the cash flows at discount rate. If discount rate is a slice, then the ith month's cashflows are discounted for i months at the ith discount rate.
- irr(<cost>,<cash flows>). Find the IRR of an initial outlay of <cost> (a positive value!), yielding cash flows (The first cash flow gets discounted one period). irr returns 0 if there's no solution.
- print(<expr>,<rows>) print <rows> of the <expr>. If <rows>=0, print entire slice.
- printIf(<expr>,<rows>,<cond>) if condition evaluates to a value > 0, execute print(<expr>,<rows>)
- histogram(<x>,<color>, <normalization>). Creates a histogram. normalization is one of: percent, count, density
- plotLine(<x>,<markerType>, <color>)
- plotXY(<x>,<y>,<markerType>, <color>)
- setPlotDim(<width>,<height>), <width>, <height> are in pixels
- render(<file>,<title>,<x label>,<y label>)
- newPlot()
Comparisons
- ==, !=, >,>=, <, <=
Logical operators are supported:
- && for "and"
- || for "or"
Logical operators resolve to 0 or 1.
type Opts ¶
type Opts func(c Pipeline)
Opts function sets an option to a Pipeline
func WithBatchSize ¶
WithBatchSize sets the batch size for the pipeline
func WithCallBack ¶
WithCallBack sets a callback function.
Example ¶
// This example shows how to create a callback during the fitting phase (fit.Do). // The callback is called at the end of each epoch. The callback below loads a new dataset after // epoch 100. Verbose = false bSize := 100 // generate a Pipeline of type *ChData that reads test.csv in the data directory mPipe := chPipe(bSize, "test1.csv") // This callback function replaces the initial dataset with newData.csv after epoch 2500 cb := func(c Pipeline) { switch d := c.(type) { case *ChData: if d.Epoch(-1) == 100 { dataPath := os.Getenv("data") // path to data directory fileName := dataPath + "/testVal.csv" f, e := os.Open(fileName) if e != nil { panic(e) } rdrx := file.NewReader(fileName, ',', '\n', 0, 0, 1, 0, f, 0) if e := rdrx.Init("", chutils.MergeTree); e != nil { panic(e) } if e := rdrx.TableSpec().Impute(rdrx, 0, .99); e != nil { panic(e) } rows, _ := rdrx.CountLines() fmt.Println("New data at end of epoch ", d.Epoch(-1)) fmt.Println("Number of rows ", rows) WithReader(rdrx)(d) } } } WithCallBack(cb)(mPipe) // This model is OLS mod := ModSpec{ "Input(x1+x2+x3+x4)", "FC(size:1)", "Target(ycts)", } // model is straight-forward with no hidden layers or dropouts. nn, e := NewNNModel(mod, mPipe, true, WithCostFn(RMS)) if e != nil { panic(e) } epochs := 150 ft := NewFit(nn, epochs, mPipe) e = ft.Do() if e != nil { panic(e) }
Output: New data at end of epoch 100 Number of rows 1000
func WithCycle ¶
WithCycle sets the cycle bool. If false, the intent is for the Pipeline to generate a new data set for each epoch.
func WithFtypes ¶
WithFtypes sets the FTypes of the Pipeline. The feature is used to override the default levels.
func WithKeepRaw ¶ added in v0.2.0
WithKeepRaw sets bool whether to keep the *Raw data in the pipeline.
func WithNormalized ¶
WithNormalized sets the features to be normalized.
func WithOneHot ¶
WithOneHot adds a one-hot field "name" based on field "from"
Example ¶
// This example shows a model that incorporates a feature (x4) as one-hot and an embedding Verbose = false bSize := 100 // generate a Pipeline of type *ChData that reads test.csv in the data directory pipe := chPipe(bSize, "test1.csv") // The feature x4 takes on values 0,1,2,...19. chPipe treats this a continuous feature. // Let's override that and re-initialize the pipeline. WithCats("x4")(pipe) WithOneHot("x4oh", "x4")(pipe) if e := pipe.Init(); e != nil { panic(e) } mod := ModSpec{ "Input(x1+x2+x3+x4oh)", "FC(size:2, activation:softmax)", "Target(yoh)", } // fmt.Println("x4 as one-hot") nn, e := NewNNModel(mod, pipe, true) if e != nil { panic(e) } fmt.Println(nn) fmt.Println("x4 as embedding") mod = ModSpec{ "Input(x1+x2+x3+E(x4oh,3))", "FC(size:2, activation:softmax)", "Target(yoh)", } nn, e = NewNNModel(mod, pipe, true) if e != nil { panic(e) } fmt.Println(nn)
Output: x4 as one-hot Inputs Field x1 continuous Field x2 continuous Field x3 continuous Field x4oh one-hot derived from feature x4 length 20 Target Field yoh one-hot derived from feature y length 2 Model Structure Input(x1+x2+x3+x4oh) FC(size:2, activation:softmax) Target(yoh) Batch size: 100 24 FC parameters 0 Embedding parameters x4 as embedding Inputs Field x1 continuous Field x2 continuous Field x3 continuous Field x4oh embedding derived from feature x4 length 20 embedding dimension of 3 Target Field yoh one-hot derived from feature y length 2 Model Structure Input(x1+x2+x3+E(x4oh,3)) FC(size:2, activation:softmax) Target(yoh) Batch size: 100 7 FC parameters 60 Embedding parameters
Example (Example2) ¶
// This example incorporates a drop out layer Verbose = false bSize := 100 // generate a Pipeline of type *ChData that reads test.csv in the data directory pipe := chPipe(bSize, "test1.csv") // generate model: target and features. Target yoh is one-hot with 2 levels mod := ModSpec{ "Input(x1+x2+x3+x4)", "FC(size:3, activation:relu)", "DropOut(.1)", "FC(size:2, activation:softmax)", "Target(yoh)", } nn, e := NewNNModel(mod, pipe, true, WithCostFn(CrossEntropy), WithName("Example With Dropouts")) if e != nil { panic(e) } fmt.Println(nn)
Output: Example With Dropouts Inputs Field x1 continuous Field x2 continuous Field x3 continuous Field x4 continuous Target Field yoh one-hot derived from feature y length 2 Model Structure Input(x1+x2+x3+x4) FC(size:3, activation:relu) DropOut(.1) FC(size:2, activation:softmax) Target(yoh) Cost function: CrossEntropy Batch size: 100 19 FC parameters 0 Embedding parameters
type Pipeline ¶
type Pipeline interface { Init() error // initialize the pipeline Rows() int // # of observations in the pipeline (size of the epoch) Batch(inputs G.Nodes) bool // puts the next batch in the input nodes Epoch(setTo int) int // manage epoch count IsNormalized(field string) bool // true if feature is normalized IsCat(field string) bool // true if feature is one-hot encoded Cols(field string) int // # of columns in the feature IsCts(field string) bool // true if the feature is continuous GetFType(field string) *FType // Get FType for the feature GetFTypes() FTypes // Get Ftypes for pipeline BatchSize() int // batch size FieldList() []string // fields available FieldCount() int // number of fields in the pipeline GData() *GData // return underlying GData Get(field string) *GDatum // return data for field GetKeepRaw() bool // returns whether raw data is kept Join(right Pipeline, onField string, joinType JoinType) (Pipeline, error) // joins two pipelines Slice(sl Slicer) (Pipeline, error) // slice the pipeline Shuffle() // shuffle data Describe(field string, topK int) string // describes a field Subset(rows []int) (newPipe Pipeline, err error) // subsets pipeline to rows Where(field string, equalTo []any) (Pipeline, error) // subset pipeline to where field=equalTo Keep(fields []string) error // keep on fields in the pipeline Drop(field string) error // drop field from the pipeline AppendRows(gd *GData, fTypes FTypes) (Pipeline, error) // appends gd to pipeline AppendRowsRaw(gd *GData) error // appends gd ONLY to *Raw data ReInit(ftypes *FTypes) (Pipeline, error) // reinitialized pipeline from *Raw data }
The Pipeline interface specifies the methods required to be a data Pipeline. The Pipeline is the middleware between the data and the fitting routines.
func AddToPipe ¶ added in v0.1.0
AddToPipe adds the Value slice in rootNode to pipe. The field will have name fieldName. To do this:
- Create the *OpNode tree to evaluate the expression using Expr2Tree
- Populate the values from a Pipeline using Evaluate.
- Add the values to the Pipeline using AddToPipe
Notes:
- AddToPipe can be within a CallBack to populate each new call to the database with the calculated fields.
- You can access the values after Evaluate without adding the field to the Pipeline from the Value element of the root node.
Example ¶
We'll add two fields to the pipeline: the sum=c+D and max=max(c,D)
var ( outPipe Pipeline err error ) Verbose = false // builds a Pipline with two fields: // c = 1,2 // D = 3,-4 pipe := buildPipe([]string{"1,2", "3,-4"}, []string{"f", "f"}) // we'll add two fields to the pipeline: the sum=c+d and max=max(c,d) // start by parsing the expressions. field1 := &OpNode{Expression: "c+D"} if e := Expr2Tree(field1); e != nil { panic(e) } field2 := &OpNode{Expression: "if(c>D,c,D)"} if e := Expr2Tree(field2); e != nil { panic(e) } // field1 and field2 nodes now have the structure of the expressions // evaluate these on pipe if err = Evaluate(field1, pipe); err != nil { panic(err) } if err = Evaluate(field2, pipe); err != nil { panic(err) } // now add them to pipe if outPipe, err = AddToPipe(field1, "sum", pipe); err != nil { panic(err) } if outPipe, err = AddToPipe(field2, "max", outPipe); err != nil { panic(err) } // see what we got field1Val := outPipe.Get("sum") fmt.Println(field1Val.Data.([]float64)) field2Val := outPipe.Get("max") fmt.Println(field2Val.Data.([]float64))
Output: [4 -2] [3 2]
func Append ¶ added in v0.2.5
Append appends pipe2 to the bottom of pipe1. pipe2 must have all the fields of pipe1 but may have extra, which are not in the returned pipe.
Example ¶
This example shows how to append one pipeline to another
Verbose = false data := os.Getenv("data") pipe1, e := CSVToPipe(data+"/pipeTest1.csv", nil, false) if e != nil { panic(e) } pipe2, e := CSVToPipe(data+"/pipeTest4.csv", nil, false) if e != nil { panic(e) } pipeOut, e := Append(pipe1, pipe2) if e != nil { panic(e) } fmt.Println("pipe1 rows: ", pipe1.Rows()) fmt.Println("pipe2 rows: ", pipe2.Rows()) fmt.Println("appended pipe rows: ", pipeOut.Rows()) fmt.Println("# of fields: ", len(pipeOut.FieldList())) fmt.Println("Field3: ", pipeOut.Get("Field3").Raw.Data)
Output: pipe1 rows: 7 pipe2 rows: 2 appended pipe rows: 9 # of fields: 3 Field3: [3 2.2 1.9 10.1 12.99 100 1001.4 -1 -2]
func CSVToPipe ¶ added in v0.1.6
CSVToPipe creates a pipe from a CSV file. Optional fts specifies the FTypes, usually to match an existing pipeline.
Example ¶
Create a Pipeline from a CSV and force a specific FType. The values of the field "row" are integers: 1,2,3,4,5,6,7. If we just load the CSV, row will be treated as float64 (continuous). The field ft instructs the code to treat it as categorical.
Verbose = false // row takes on values 1,2,3,... If we do nothing, the pipe will convert these to float64. // Specifying the role as FRCat will cause "row" to be treated as categorical. ft := &FType{ Name: "row", Role: FRCat, Cats: 0, EmbCols: 0, Normalized: false, From: "", FP: nil, } data := os.Getenv("data") + "/pipeTest1.csv" pipe, e := CSVToPipe(data, FTypes{ft}, false) if e != nil { panic(e) } fmt.Println("# Rows: ", pipe.Rows()) mapped := pipe.Get("row").Data.([]int32) fmt.Println(mapped) // categorical values are mapped to int32. fmt.Println("\nmap for field row:") rowMap := pipe.GetFType("row").FP.Lvl // the raw values in pipeTest1.csv run from 1 to 7 for raw := int64(1); raw < int64(len(mapped))+1; raw++ { fmt.Printf("raw: %v, mapped: %v\n", raw, rowMap[any(raw)]) }
Output: # Rows: 7 [0 1 2 3 4 5 6] map for field row: raw: 1, mapped: 0 raw: 2, mapped: 1 raw: 3, mapped: 2 raw: 4, mapped: 3 raw: 5, mapped: 4 raw: 6, mapped: 5 raw: 7, mapped: 6
type Raw ¶
Raw holds a raw slice of type Kind
func NewRaw ¶
NewRaw creates a new raw slice from x. This assumes all elements of x are the same Kind
func NewRawCast ¶
func (*Raw) CumeAfter ¶ added in v0.2.0
CumeAfter cumulates the data after the current row, for each row.
AggType can take on the following values: - "sum" Cumulative sums are taken. - "product" Cumulative products are taken. - "count" Counts for rows are taken.
For "sum" and "product", the value "missing" is used for the last row.
func (*Raw) CumeBefore ¶ added in v0.2.0
CumeBefore cumulates the data before the current row, for each row.
AggType can take on the following values: - "sum" Cumulative sums are taken. - "product" Cumulative products are taken. - "count", "row" Counts for rows are taken.
For "sum" and "product", the value "missing" is used for the first row.
type Slice ¶
type Slice struct {
// contains filtered or unexported fields
}
Slice implements generating Slicer functions for a feature. These are used to slice through the values of a discrete feature. For continuous features, it slices by quartile.
func NewSlice ¶
NewSlice makes a new Slice based on feat in Pipeline pipe. minCnt is the minimum # of obs a slice must have to be used. Restrict is a slice of values to restrict Iter to.
func (*Slice) Iter ¶
Iter iterates through the levels (ranges) of the feature. Returns false when done.
Example ¶
// An example of slicing through the data to generate diagnostics on subsets. // The code here will generate a decile plot for each of the 20 levels of x4. Verbose = false bSize := 100 // generate a Pipeline of type *ChData that reads test.csv in the data directory pipe := chPipe(bSize, "test1.csv") // The feature x4 takes on values 0,1,2,...19. chPipe treats this a continuous feature. // Let's override that and re-initialize the pipeline. WithCats("x4")(pipe) WithOneHot("x4oh", "x4")(pipe) if e := pipe.Init(); e != nil { panic(e) } mod := ModSpec{ "Input(x1+x2+x3+x4oh)", "FC(size:2, activation:softmax)", "Target(yoh)", } nn, e := NewNNModel(mod, pipe, true) if e != nil { panic(e) } WithCostFn(CrossEntropy)(nn) ft := NewFit(nn, 100, pipe) if e = ft.Do(); e != nil { panic(e) } sf := os.TempDir() + "/nnTest" e = nn.Save(sf) if e != nil { panic(e) } WithBatchSize(8500)(pipe) pred, e := PredictNN(sf, pipe, false) if e != nil { panic(e) } if e = AddFitted(pipe, sf, []int{1}, "fit", nil, false, nil); e != nil { panic(e) } _ = os.Remove(sf + "P.nn") _ = os.Remove(sf + "S.nn") s, e := NewSlice("x4", 0, pipe, nil) if e != nil { panic(e) } fit, e := Coalesce(pred.FitSlice(), 2, []int{1}, false, false, nil) if e != nil { panic(e) } desc, e := NewDesc(nil, "Descriptive Statistics") for s.Iter() { slicer := s.MakeSlicer() if e != nil { panic(e) } desc.Populate(fit, true, slicer) fmt.Printf("Slice x4=%v has %d observations\n", s.Value(), desc.N) }
Output: Slice x4=0 has 391 observations Slice x4=1 has 408 observations Slice x4=2 has 436 observations Slice x4=3 has 428 observations Slice x4=4 has 417 observations Slice x4=5 has 472 observations Slice x4=6 has 424 observations Slice x4=7 has 455 observations Slice x4=8 has 431 observations Slice x4=9 has 442 observations Slice x4=10 has 411 observations Slice x4=11 has 413 observations Slice x4=12 has 433 observations Slice x4=13 has 416 observations Slice x4=14 has 434 observations Slice x4=15 has 367 observations Slice x4=16 has 437 observations Slice x4=17 has 433 observations Slice x4=18 has 429 observations Slice x4=19 has 423 observations
func (*Slice) MakeSlicer ¶
MakeSlicer makes a Slicer function for the current value (discrete) or range (continuous) of the feature. Continuous features are sliced at the lower quartile, median and upper quartile, producing 4 slices.
type Slicer ¶
Slicer is an optional function that returns true if the row is to be used in calculations. This is used to subset the diagnostics to specific values.
type Summary ¶
type Summary struct { NRows int // size of the data DistrC *Desc // summary of continuous field DistrD Levels // summary of discrete field }
Summary has descriptive statistics of a field using its current data.
type VecData ¶
type VecData struct {
// contains filtered or unexported fields
}
func (*VecData) AppendRows ¶ added in v0.2.8
AppendRows appends rows to the existing GData and then re-initializes each GDatum, using the fTypes, if provided.
func (*VecData) AppendRowsRaw ¶ added in v0.2.8
AppendRowsRaw simply appends rows, in place, to the existing GData. Only the *Raw data is updated. The .Data field is set to nil.
func (*VecData) Describe ¶
Describe describes a field. If the field has role FRCat, the top k values (by frequency) are returned.
func (*VecData) FieldCount ¶ added in v0.2.7
FieldCount returns the number of fields in the pipeline
func (*VecData) GetKeepRaw ¶ added in v0.2.0
func (*VecData) IsNormalized ¶
IsNormalized returns true if the field is normalized.
func (*VecData) ReInit ¶ added in v0.2.8
ReInit re-initializes the Data field from Raw for each GDatum. If ftypes is not nil, these values are used, otherwise the FParam values are re-derived from the data. A new pipeline is returned.
func (*VecData) SaveFTypes ¶
SaveFTypes saves the FTypes for the Pipeline.