Documentation ¶
Overview ¶
Package lib contains basic library routines for Miller.
Index ¶
- Constants
- func BoolToInt(b bool) int64
- func BooleanXOR(a, b bool) bool
- func CompileMillerRegex(regexString string) (*regexp.Regexp, error)
- func CompileMillerRegexOrDie(regexString string) *regexp.Regexp
- func CompileMillerRegexesOrDie(regexStrings []string) []*regexp.Regexp
- func CopyStringArray(input []string) []string
- func EpochNanosecondsToGMT(epochNanoseconds int64) time.Time
- func EpochNanosecondsToLocalTime(epochNanoseconds int64) time.Time
- func EpochNanosecondsToLocationTime(epochNanoseconds int64, location *time.Location) time.Time
- func EpochSecondsToGMT(epochSeconds float64) time.Time
- func EpochSecondsToLocalTime(epochSeconds float64) time.Time
- func EpochSecondsToLocationTime(epochSeconds float64, location *time.Location) time.Time
- func FormatAsParagraph(text string, maxWidth int) []string
- func GetArrayKeysSorted(input map[string]string) []string
- func GetCov(nint int64, sumx float64, sumy float64, sumxy float64) float64
- func GetCovMatrix(nint int64, sumx float64, sumx2 float64, sumy float64, sumy2 float64, ...) (Q [2][2]float64)
- func GetKurtosis(nint int, sumx float64, sumx2 float64, sumx3 float64, sumx4 float64) float64
- func GetLinearRegressionOLS(nint int64, sumx float64, sumx2 float64, sumxy float64, sumy float64) (m, b float64)
- func GetLinearRegressionPCA(eigenvalue_1 float64, eigenvalue_2 float64, eigenvector_1 [2]float64, ...) (m, b, quality float64)
- func GetRealSymmetricEigensystem(matrix [2][2]float64) (eigenvalue1 float64, eigenvalue2 float64, eigenvector1 [2]float64, ...)
- func GetSkewness(nint int, sumx float64, sumx2 float64, sumx3 float64) float64
- func GetVar(nint int64, sumx float64, sumx2 float64) float64
- func Getoptify(inargs []string) []string
- func IntMin2(a, b int64) int64
- func InternalCodingErrorIf(condition bool)
- func InternalCodingErrorPanic(message string)
- func InternalCodingErrorWithMessageIf(condition bool, message string)
- func InterpolateCaptures(replacementString string, replacementMatrix [][]int, captures []string) string
- func Invqnorm(x float64) float64
- func IsEOF(err error) bool
- func IsUpdateableInPlace(filename string, prepipe string) error
- func LoadStringFromFile(filename string) (string, error)
- func LoadStringsFromDir(dirname string, extension string) ([]string, error)
- func LoadStringsFromFileOrDir(path string, extension string) ([]string, error)
- func LogisticRegression(xs, ys []float64) (m, b float64)
- func MakeEmptyCaptures() []string
- func Nsec2GMT(epochNanoseconds int64, numDecimalPlaces int) string
- func Nsec2LocalTime(epochNanoseconds int64, numDecimalPlaces int) string
- func Nsec2LocationTime(epochNanoseconds int64, numDecimalPlaces int, location *time.Location) string
- func OpenFileForRead(filename string, prepipe string, prepipeIsRaw bool, ...) (io.ReadCloser, error)
- func OpenInboundHalfPipe(commandString string) (*os.File, error)
- func OpenOutboundHalfPipe(commandString string) (*os.File, error)
- func OpenStdin(prepipe string, prepipeIsRaw bool, encoding TFileInputEncoding) (io.ReadCloser, error)
- func PathToHandle(path string) (io.ReadCloser, error)
- func Plural(n int) string
- func PrintWordsAsParagraph(words []string)
- func Qnorm(x float64) float64
- func RandFloat64() float64
- func RandInt63() int64
- func RandRange(lowInclusive, highExclusive int64) int64
- func RandUint32() uint32
- func ReadCSVHeader(filename string) ([]string, error)
- func RegexCompiledMatchSimple(input string, regex *regexp.Regexp) bool
- func RegexCompiledMatchWithCaptures(input string, regex *regexp.Regexp) (bool, []string)
- func RegexCompiledMatchWithMapResults(input string, regex *regexp.Regexp) (bool, []string, []int, []int)
- func RegexCompiledSplitString(regex *regexp.Regexp, input string, n int) []string
- func RegexCompiledSub(input string, regex *regexp.Regexp, replacement string, ...) string
- func RegexStringGsub(input string, sregex string, replacement string) string
- func RegexStringMatchSimple(input string, sregex string) bool
- func RegexStringMatchWithCaptures(input string, sregex string) (matches bool, capturesOneUp []string)
- func RegexStringMatchWithMapResults(input string, sregex string) (matches bool, captures []string, starts []int, ends []int)
- func RegexStringSub(input string, sregex string, replacement string) string
- func ReplacementHasCaptures(replacement string) (hasCaptures bool, matrix [][]int)
- func ReverseStringList(strings []string)
- func Sec2GMT(epochSeconds float64, numDecimalPlaces int) string
- func Sec2LocalTime(epochSeconds float64, numDecimalPlaces int) string
- func Sec2LocationTime(epochSeconds float64, numDecimalPlaces int, location *time.Location) string
- func SeedRandom(seed int64)
- func SetTZFromEnv() error
- func Sgn(a float64) float64
- func SortStrings(strings []string)
- func SortedStrings(strings []string) []string
- func SplitString(input string, separator string) []string
- func StringListToSet(stringList []string) map[string]bool
- func StripEmpties(input []string) []string
- func TSVDecodeField(input string) string
- func TSVEncodeField(input string) string
- func TryBoolFromBoolString(input string) (bool, bool)
- func TryFloatFromString(input string) (float64, bool)
- func TryIntFromString(input string) (int64, bool)
- func TryIntFromStringWithBase(input string, base int64) (int64, bool)
- func TryLatin1ToUTF8(input string) (string, error)
- func TryUTF8ToLatin1(input string) (string, error)
- func UTF8Strlen(s string) int64
- func UnbackslashStringLiteral(input string) string
- func UnhexStringLiteral(input string) string
- func WhereAreWe()
- func WrapOutputHandle(fileWriteHandle io.WriteCloser, inputFileEncoding TFileInputEncoding) (io.WriteCloser, bool, error)
- func WriteTempFileOrDie(contents string) string
- type BZip2ReadCloser
- type OrderedMap
- func (omap *OrderedMap) Clear()
- func (omap *OrderedMap) Get(key string) interface{}
- func (omap *OrderedMap) GetKeys() []string
- func (omap *OrderedMap) GetKeysExcept(exceptions map[string]bool) []string
- func (omap *OrderedMap) GetWithCheck(key string) (interface{}, bool)
- func (omap *OrderedMap) Has(key string) bool
- func (omap *OrderedMap) IsEmpty() bool
- func (omap *OrderedMap) Put(key string, value interface{})
- func (omap *OrderedMap) Remove(key string) bool
- type TFileInputEncoding
- type ZstdReadCloser
Constants ¶
const DOC_URL = "https://miller.readthedocs.io"
const INVQNORM_MAXITER int = 30
const INVQNORM_TOL float64 = 1e-9
const JACOBI_MAXITER = 20
const JACOBI_TOLERANCE = 1e-12
Variables ¶
This section is empty.
Functions ¶
func BooleanXOR ¶
func CompileMillerRegex ¶
CompileMillerRegex wraps Go regex-compile with some Miller-specific syntax which predates the port of Miller from C to Go. Miller regexes use a final 'i' to indicate case-insensitivity; Go regexes use an initial "(?i)".
(See also mlr.bnf where we specify which things can be backslash-escaped without a syntax error at parse time.)
* If the regex_string is of the form a.*b, compiles it case-sensitively. * If the regex_string is of the form "a.*b", compiles a.*b case-sensitively. * If the regex_string is of the form "a.*b"i, compiles a.*b case-insensitively.
func CompileMillerRegexOrDie ¶
CompileMillerRegexOrDie wraps CompileMillerRegex. Usually in Go we want to return a second error argument rather than fataling. However, if there's a malformed regex we really cannot continue so it's simpler to just fatal.
func CompileMillerRegexesOrDie ¶
CompileMillerRegexesOrDie is a convenience looper over CompileMillerRegexOrDie.
func CopyStringArray ¶
func EpochNanosecondsToGMT ¶
func EpochSecondsToGMT ¶
func EpochSecondsToLocalTime ¶
func FormatAsParagraph ¶
For online help contexts like printing all the built-in DSL functions, or the list of all verbs. Max width is nominally 80.
func GetArrayKeysSorted ¶
Go doesn't preserve insertion order in its maps, so here we make an accessor for getting the keys in sorted order for the benefit of map-printers.
func GetCovMatrix ¶
func GetCovMatrix( nint int64, sumx float64, sumx2 float64, sumy float64, sumy2 float64, sumxy float64, ) (Q [2][2]float64)
----------------------------------------------------------------
func GetKurtosis ¶
func GetLinearRegressionOLS ¶
func GetLinearRegressionPCA ¶
func GetSkewness ¶
GetSkewness is the finalizing function for computing skewness from streamed accumulator values.
func GetVar ¶
GetVar is the finalizing function for computing variance from streamed accumulator values.
func Getoptify ¶
Getoptify expands "-xyz" into "-x -y -z" while leaving "--xyz" intact. This is a keystroke-saver for the user.
This is OK to do here globally since Miller is quite consistent (in main, verbs, auxents, and terminals) that multi-character options start with two dashes, e.g. "--csv". (The sole exception is the sort verb's -nf/-nr which are handled specially there.)
Additionally, we split "--foo=bar" into "--foo" and "bar".
func InternalCodingErrorIf ¶
func InternalCodingErrorIf(condition bool)
InternalCodingErrorIf is a lookalike for C's __FILE__ and __LINE__ printing, with exit 1 if the condition is true.
func InternalCodingErrorPanic ¶
func InternalCodingErrorPanic(message string)
InternalCodingErrorPanic is like InternalCodingErrorIf, except that it panics the process (for stack trace, which is usually not desired), and that it requires the if-test to be at the caller.
func InternalCodingErrorWithMessageIf ¶
InternalCodingErrorWithMessageIf is a lookalike for C's __FILE__ and __LINE__ printing, with exit 1 if the condition is true.
func InterpolateCaptures ¶
func InterpolateCaptures( replacementString string, replacementMatrix [][]int, captures []string, ) string
InterpolateCaptures example:
* Input $x is "ab_cde"
- DSL expression if ($x =~ "(..)_(...)") { ... other lines of code ... $y = "\2:\1"; }
* InterpolateCaptures is used on the evaluation of "\2:\1"
* replacementString is "\2:\1"
replacementMatrix contains precomputed/cached offsets for the "\2" and "\1" substrings within "\2:\1"
captures has slot 0 being "ab_cde" (for "\0"), slot 1 being "ab" (for "\1"), slot 2 being "cde" (for "\2"), and slots 3-9 being "".
func IsEOF ¶
IsEOF handles the following problem: reading past end of files opened with os.Open returns the error which is io.EOF. Reading past close of pipes opened with popen (e.g. Miller's prepipe, where the file isn't 'foo.dat' but rather the process 'gunzip < foo.dat |') returns not io.EOF but an error with 'file already closed' within it. See also https://stackoverflow.com/questions/47486128/why-does-io-pipe-continue-to-block-even-when-eof-is-reached
func IsUpdateableInPlace ¶
IsUpdateableInPlace tells if we can use the input with mlr -I: not for URLs, and not for prepipe commands (which we don't presume to know how to invert for output).
func LoadStringFromFile ¶
LoadStringFromFile is just a wrapper around os.ReadFile, with a cast from []byte to string.
func LoadStringsFromDir ¶
LoadStringsFromDir loads all file contents for files in the given directory having the given extension. E.g. LoadStringsFromDir("/u/myfiles", ".mlr") will load /u/myfiles/foo.mlr and /u/myfiles/bar.mlr but will skip over /u/myfiles/data.csv and /u/myfiles/todo.txt.
func LoadStringsFromFileOrDir ¶
LoadStringsFromFileOrDir calls LoadStringFromFile if path exists and is a file, or LoadStringsFromDir if path exists and is a directory. In the former case the extension is ignored; in the latter case it's used as a filter on the directory entries.
func LogisticRegression ¶
func MakeEmptyCaptures ¶
func MakeEmptyCaptures() []string
MakeEmptyCaptures is for initial CST state at the start of executing the DSL expression for the current record. Even if '$x =~ "(..)_(...)"' set "\1" and "\2" on the previous record, at start of processing for the current record we need to start with a clean slate. This is in support of CST state, which `=~` semantics requires.
func Nsec2LocalTime ¶
func Nsec2LocationTime ¶
func OpenFileForRead ¶
func OpenFileForRead( filename string, prepipe string, prepipeIsRaw bool, encoding TFileInputEncoding, ) (io.ReadCloser, error)
OpenFileForRead: If prepipe is non-empty, popens "{prepipe} < {filename}" and returns a handle to that where prepipe is nominally things like "gunzip", "cat", etc. Otherwise, delegates to an in-process reader which can natively handle gzip/bzip2/zlib depending on the specified encoding. If the encoding isn't a compression encoding, this ends up being simply os.Open.
func OpenStdin ¶
func OpenStdin( prepipe string, prepipeIsRaw bool, encoding TFileInputEncoding, ) (io.ReadCloser, error)
OpenStdin: if prepipe is non-empty, popens "{prepipe}" and returns a handle to that where prepipe is nominally things like "gunzip", "cat", etc. Otherwise, delegates to an in-process reader which can natively handle gzip/bzip2/zlib depending on the specified encoding. If the encoding isn't a compression encoding, this ends up being simply os.Stdin.
func PathToHandle ¶
func PathToHandle( path string, ) (io.ReadCloser, error)
PathToHandle maps various back-ends to a stream. As of 2021-07-07, the following URI schemes are supported: * https://... and http://... * file://... * plain disk files
func PrintWordsAsParagraph ¶
func PrintWordsAsParagraph(words []string)
For online help contexts like printing all the built-in DSL functions, or the list of all verbs.
func Qnorm ¶
Normal cumulative distribution function, expressed in terms of erfc library function (which is awkward, but exists).
func RandFloat64 ¶
func RandFloat64() float64
func RandUint32 ¶
func RandUint32() uint32
func ReadCSVHeader ¶
func RegexCompiledMatchSimple ¶
RegexCompiledMatchSimple is for simple boolean return without any substring captures.
func RegexCompiledMatchWithCaptures ¶
RegexCompiledMatchWithCaptures is the implementation for the =~ operator. Without Miller-style regex captures this would be a simple one-line regex.MatchString(input). However, we return the captures array for the benefit of subsequent references to "\0".."\9".
func RegexCompiledMatchWithMapResults ¶
func RegexCompiledMatchWithMapResults( input string, regex *regexp.Regexp, ) (bool, []string, []int, []int)
RegexCompiledMatchWithMapResults does the work for RegexStringMatchWithMapResults once a compiled regexp is available. Array slot 0 is for the full match; slots 1 and up are for the capture-matches such as "\([0-9]+\):\([a-z]+\)".
func RegexCompiledSplitString ¶
In Go as in all languages I'm aware of with a string-split, "a,b,c" splits on "," to ["a", "b", "c"] and "a" splits to ["a"], both of which are fine -- but "" splits to [""] when I wish it were []. This function does the latter.
func RegexCompiledSub ¶
func RegexCompiledSub( input string, regex *regexp.Regexp, replacement string, replacementCaptureMatrix [][]int, ) string
RegexCompiledSub is the same as RegexStringSub but with compiled regex and replacement strings.
func RegexStringGsub ¶
RegexStringGsub implements the `gsub` DSL function.
func RegexStringMatchSimple ¶
RegexStringMatchSimple is for simple boolean return without any substring captures.
func RegexStringMatchWithCaptures ¶
func RegexStringMatchWithCaptures( input string, sregex string, ) ( matches bool, capturesOneUp []string, )
RegexStringMatchWithCaptures implements the =~ DSL operator. The captures are stored in DSL state and may be used by a DSL statement after the =~. For example, in
sub($a, "(..)_(...)", "\1:\2")
the replacement string is an argument to sub and therefore the captures are confined to the implementation of the sub function. Similarly for gsub. But for the match operator, people can do
if ($x =~ "(..)_(...)") { ... other lines of code ... $y = "\2:\1" }
and the =~ callsite doesn't know if captures will be used or not. So, RegexStringMatchWithCaptures always returns the captures array. It is stored within the CST state.
func RegexStringMatchWithMapResults ¶
func RegexStringMatchWithMapResults( input string, sregex string, ) ( matches bool, captures []string, starts []int, ends []int, )
RegexStringMatchWithMapResults implements much of the `strmatchx` DSL function. This returns captures via return values. This is distinct from RegexStringMatchWithCaptures which is for the `=~` DSL operator.
func RegexStringSub ¶
RegexStringSub implements the sub DSL function.
func ReplacementHasCaptures ¶
ReplacementHasCaptures is used by the CST builder to see if string-literal is like "foo bar" or "foo \1 bar" -- in the latter case it needs to retain the compiled offsets-matrix information. This is in support of CST state, which `=~` semantics requires.
func ReverseStringList ¶
func ReverseStringList(strings []string)
func Sec2LocalTime ¶
func Sec2LocationTime ¶
func SeedRandom ¶
func SeedRandom(seed int64)
Users can request specific seeds if they want the same random-number sequence on each run.
func SetTZFromEnv ¶
func SetTZFromEnv() error
SetTZFromEnv applies the $TZ environment variable. There are three reasons for this: (1) On Windows (as of 2021-10-20), this is necessary to get $TZ into use. (2) On Linux/Mac, as of this writing it is not necessary for the initial value of TZ at startup. However, an explicit check is helpful: if someone does 'export TZ=Something/Invalid' and then runs Miller, the invalid TZ would otherwise be simply *ignored* -- we want to surface that error to the user. (3) On any platform this is necessary for *changing* TZ mid-process: e.g. if a DSL statement does 'ENV["TZ"] = "Asia/Istanbul"'.
func SortStrings ¶
func SortStrings(strings []string)
func SortedStrings ¶
func SplitString ¶
In Go as in all languages I'm aware of with a string-split, "a,b,c" splits on "," to ["a", "b", "c"] and "a" splits to ["a"], both of which are fine -- but "" splits to [""] when I wish it were []. This function does the latter.
func StringListToSet ¶
func StripEmpties ¶
func TSVDecodeField ¶
TSVDecodeField is for the TSV record-reader.
func TSVEncodeField ¶
TSVEncodeField is for the TSV record-writer.
func TryBoolFromBoolString ¶
func TryFloatFromString ¶
func TryIntFromString ¶
TryIntFromString tries decimal, hex, octal, and binary.
func TryIntFromStringWithBase ¶
TryIntFromStringWithBase allows the user to choose the base that's used, rather than inferring from 0x prefix, etc as TryIntFromString does.
func TryLatin1ToUTF8 ¶
func TryUTF8ToLatin1 ¶
func UTF8Strlen ¶
func UnbackslashStringLiteral ¶
UnbackslashStringLiteral replaces "\t" with TAB, etc. for DSL expressions like '$foo = "a\tb"'. See also https://en.wikipedia.org/wiki/Escape_sequences_in_C (predates the port of Miller from C to Go).
Note that a CST-build pre-pass intentionally excludes regex literals (2nd argument to sub/gsub/regextract/etc) from being modified here.
Note "\0" .. "\9" are used for regex captures within the DSL CST builder and are not touched here. (See also lib/regex.go.)
func UnhexStringLiteral ¶
UnhexStringLiteral is like UnbackslashStringLiteral but only unhexes things like "\x1f". This is for IFS and IPS setup; see the cli package.
func WrapOutputHandle ¶
func WrapOutputHandle( fileWriteHandle io.WriteCloser, inputFileEncoding TFileInputEncoding, ) (io.WriteCloser, bool, error)
WrapOutputHandle wraps a file-write handle with a compressor. The first return value is the wrapped handle. The second is true if the returned handle needs to be closed separately from the original. The third is an error, for in-process compression we can't undo: namely, as of September 2021 the gzip and zlib libraries support write-closers, but the bzip2 library does not.
func WriteTempFileOrDie ¶
WriteTempFileOrDie places the contents string into a temp file, which the caller must remove.
Types ¶
type BZip2ReadCloser ¶
type BZip2ReadCloser struct {
// contains filtered or unexported fields
}
BZip2ReadCloser remedies the fact that bzip2.NewReader does not implement io.ReadCloser.
func NewBZip2ReadCloser ¶
func NewBZip2ReadCloser(handle io.ReadCloser) *BZip2ReadCloser
func (*BZip2ReadCloser) Close ¶
func (rc *BZip2ReadCloser) Close() error
type OrderedMap ¶
type OrderedMap struct { FieldCount int64 Head *orderedMapEntry Tail *orderedMapEntry // contains filtered or unexported fields }
----------------------------------------------------------------
func NewOrderedMap ¶
func NewOrderedMap() *OrderedMap
----------------------------------------------------------------
func (*OrderedMap) Clear ¶
func (omap *OrderedMap) Clear()
----------------------------------------------------------------
func (*OrderedMap) Get ¶
func (omap *OrderedMap) Get(key string) interface{}
----------------------------------------------------------------
func (*OrderedMap) GetKeys ¶
func (omap *OrderedMap) GetKeys() []string
func (*OrderedMap) GetKeysExcept ¶
func (omap *OrderedMap) GetKeysExcept(exceptions map[string]bool) []string
Returns an array of keys, not including the ones specified. The ones specified are to be passed in as a map from string to bool, as Go doesn't have hash-sets.
func (*OrderedMap) GetWithCheck ¶
func (omap *OrderedMap) GetWithCheck(key string) (interface{}, bool)
The Get is sufficient for pointer values -- the caller can check if the return value is nil. For int/string values (which are non-nullable) we have this method.
func (*OrderedMap) Has ¶
func (omap *OrderedMap) Has(key string) bool
func (*OrderedMap) IsEmpty ¶
func (omap *OrderedMap) IsEmpty() bool
----------------------------------------------------------------
func (*OrderedMap) Put ¶
func (omap *OrderedMap) Put(key string, value interface{})
----------------------------------------------------------------
func (*OrderedMap) Remove ¶
func (omap *OrderedMap) Remove(key string) bool
---------------------------------------------------------------- Returns true if it was found and removed
type TFileInputEncoding ¶
type TFileInputEncoding int
const ( FileInputEncodingDefault TFileInputEncoding = iota FileInputEncodingBzip2 FileInputEncodingGzip FileInputEncodingZlib FileInputEncodingZstd )
func FindInputEncoding ¶
func FindInputEncoding( filename string, inputFileInputEncoding TFileInputEncoding, ) TFileInputEncoding
FindInputEncoding determines the input encoding (compression), whether from a flag like --gzin, or from filename suffix like ".gz". If the user did --gzin on the command line, TFileInputEncoding will be FileInputEncodingGzip. If they didn't, but the filename ends in ".gz", then we auto-infer FileInputEncodingGzip. Either way, this function tells if we will be using in-process decompression within the file-format-specific record reader.
type ZstdReadCloser ¶
type ZstdReadCloser struct {
// contains filtered or unexported fields
}
ZstdReadCloser remedies the fact that zstd.NewReader does not implement io.ReadCloser.
func NewZstdReadCloser ¶
func NewZstdReadCloser(handle io.ReadCloser) (*ZstdReadCloser, error)
func (*ZstdReadCloser) Close ¶
func (rc *ZstdReadCloser) Close() error