file

package
v1.1.35 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 26, 2024 License: Apache-2.0 Imports: 8 Imported by: 7

Documentation

Overview

Package file implements Input/Output for text files. Text files can

  • Be delimited or fixed-width
  • Have header rows or not

Ctrl-R values in the files are ignored.

Index

Examples

Constants

This section is empty.

Variables

This section is empty.

Functions

func Rdrs

func Rdrs(rdr0 *Reader, nRdrs int) (r []chutils.Input, err error)

Rdrs generates a slice of Readers of length nRdrs. The data represented by rdr0 is divided equally among the Readers in the slice.

func Wrtrs

func Wrtrs(tmpDir string, nWrtr int, con *chutils.Connect, separator, eol, quote rune, table string) (wrtrs []chutils.Output, err error)

Wrtrs creates a slice of Writers suitable for chutils.Concur. The file names are chosen randomly.

Types

type Reader

// Reader implements the chutils.Input interface for text files.
type Reader struct {
	Skip     int // Skip is the number of rows to skip in the file
	RowsRead int // RowsRead is the current count of rows read from the file (includes header)
	MaxRead  int // MaxRead is the maximum number of rows to read

	Width int  // Width is the line width for flat files
	Quote rune // Quote is the optional quote around strings that contain the Separator
	// contains filtered or unexported fields
}

Reader implements chutils.Input interface.

func NewReader

func NewReader(filename string, separator, eol, quote rune, width int, skip int, maxRead int,
	rws io.ReadSeekCloser, bufSize int) *Reader

NewReader initializes an instance of Reader

func (*Reader) Close

func (rdr *Reader) Close() error

Close closes the underlying io.ReadSeekCloser.

func (*Reader) CountLines

func (rdr *Reader) CountLines() (numLines int, err error)

CountLines returns the number of rows in the source data. This does not include any header rows.

func (*Reader) EOL

func (rdr *Reader) EOL() rune

EOL returns end-of-line rune

func (*Reader) Init

func (rdr *Reader) Init(key string, engine chutils.EngineType) error

Init initializes the FieldDefs slice of Reader.TableSpec() from the header row of the input. It does not set any of the field types. If key is empty, it defaults to the first field.

func (*Reader) Name

func (rdr *Reader) Name() string

Name returns the name of the file being read

func (*Reader) Read

func (rdr *Reader) Read(nTarget int, validate bool) (data []chutils.Row, valid []chutils.Valid, err error)

Read reads nTarget rows. If nTarget == 0, the entire file is read.

If validate == true:

  • The data is validated according to the rules in rdr.TableSpec.
  • The results are returned as the slice valid.
  • data is returned with the fields appropriately typed.

If validate == false:

  • The data is not validated.
  • The return slice valid is nil
  • The fields are returned as strings.

err returns io.EOF at end of file

Example (CSV)

Loading a CSV, cleaning values and loading into ClickHouse using package file reader and writer

/*

	If you haven't created the table first, you'll get this error simply importing the file via clickhouse-client

	Code: 60. DB::Exception: Received from 127.0.0.1:9000. DB::Exception: Table testing.values doesn't exist. (UNKNOWN_TABLE)

	Once the table exists, the clickhouse-client approach produces this error:

	Row 3:
	Column 0,   name: id,    type: String,         parsed text: "1B23"
	Column 1,   name: zip,   type: FixedString(5), parsed text: "77810"
	Column 2,   name: value, type: Float64,        parsed text: "NA"ERROR
	Code: 27. DB::Exception: Cannot parse NaN. (CANNOT_PARSE_INPUT_ASSERTION_FAILED) (version 22.4.5.9 (official build))

	/home/test/data/zip_data.csv:
	id,zip,value
	1A34,90210,20.8
	1X88,43210,19.2
	1B23,77810,NA
	1r99,94043,100.4
	1x09,hello,9.9
*/

const inFile = "/home/will/tmp/zip_data.csv" // source data
const table = "testing.values"               // ClickHouse destination table
tmpFile := os.TempDir() + "/tmp.csv"         // temp file to write data to for import

// Connect to ClickHouse; the connection is released when the example returns.
con, err := chutils.NewConnect("127.0.0.1", "tester", "testGoNow", clickhouse.Settings{})
if err != nil {
	panic(err)
}
defer func() {
	_ = con.Close()
}()

// Open the source CSV and wrap it in a Reader: comma-delimited, newline EOL,
// double-quote as the quote rune, one row skipped.
f, err := os.Open(inFile)
if err != nil {
	panic(err)
}
rdr := NewReader(inFile, ',', '\n', '"', 0, 1, 0, f, 50000)
defer func() {
	_ = rdr.Close()
}()

// Build the field definitions from the header row, then impute field types.
// Each step reports its own error value e, which is what must be surfaced.
if e := rdr.Init("id", chutils.MergeTree); e != nil {
	panic(e)
}
if e := rdr.TableSpec().Impute(rdr, 0, .95); e != nil {
	panic(e)
}
// Check the internal consistency of TableSpec
if e := rdr.TableSpec().Check(); e != nil {
	panic(e)
}

// Specify zip as FixedString(5) with a missing value of 00000
_, fd, err := rdr.TableSpec().Get("zip")
if err != nil {
	panic(err)
}
// zip will impute to int if we don't make this change
fd.ChSpec.Base = chutils.ChFixedString
fd.ChSpec.Length = 5
fd.Missing = "00000"
legal := []string{"90210", "43210", "77810", "94043"}
fd.Legal.Levels = legal

// Specify value as having a range of [0,30] with a missing value of -1.0
_, fd, err = rdr.TableSpec().Get("value")
if err != nil {
	panic(err)
}
fd.Legal.HighLimit = 30.0
fd.Legal.LowLimit = 0.0
fd.Missing = -1.0

// Create the destination table from the finished TableSpec.
rdr.TableSpec().Engine = chutils.MergeTree
rdr.TableSpec().Key = "id"
if e := rdr.TableSpec().Create(con, table); e != nil {
	panic(e)
}

// Export the source data through a pipe-delimited temp file into ClickHouse.
fx, err := os.Create(tmpFile)
if err != nil {
	panic(err)
}
defer func() {
	_ = fx.Close()
}()
defer func() {
	_ = os.Remove(tmpFile)
}()
wrtr := NewWriter(fx, tmpFile, con, '|', '\n', 0, table)
if e := chutils.Export(rdr, wrtr, 0, false); e != nil {
	panic(e)
}

// Read the table back and print each row.
qry := fmt.Sprintf("SELECT * FROM %s", table)
res, err := con.Query(qry)
if err != nil {
	panic(err)
}
defer func() {
	_ = res.Close()
}()
for res.Next() {
	var (
		id    string
		zip   string
		value float64
	)
	// Panic with the Scan error itself; err is nil here because Query succeeded.
	if e := res.Scan(&id, &zip, &value); e != nil {
		panic(e)
	}
	fmt.Println(id, zip, value)
}
Output:

1A34 90210 20.8
1B23 77810 -1
1X88 43210 19.2
1r99 94043 -1
1x09 00000 9.9

func (*Reader) Reset

func (rdr *Reader) Reset() error

Reset sets the file pointer to the start of the file

func (*Reader) Seek

func (rdr *Reader) Seek(lineNo int) error

Seek points the reader to line lineNo in the source data.

func (*Reader) Separator

func (rdr *Reader) Separator() rune

Separator returns field separator rune

func (*Reader) SetTableSpec

func (rdr *Reader) SetTableSpec(ts *chutils.TableDef)

SetTableSpec sets Reader.tablespec. Needed if tablespec is not created by Reader.TableSpec().Impute().

func (*Reader) TableSpec

func (rdr *Reader) TableSpec() *chutils.TableDef

TableSpec returns the TableDef

type Writer

// Writer implements chutils.Output. It accepts any type that satisfies
// io.WriteCloser; typically, this would be a file.
type Writer struct {
	io.WriteCloser
	Table string // Table is the ClickHouse table to insert into
	// contains filtered or unexported fields
}

Writer implements chutils.Output. Writer will accept any type that satisfies io.WriteCloser. Typically, this would be a file.

func NewWriter

func NewWriter(f io.WriteCloser, name string, con *chutils.Connect, separator, eol, quote rune, table string) *Writer

NewWriter creates a new Writer instance

func (*Writer) EOL

func (wtr *Writer) EOL() rune

EOL returns the end-of-line rune

func (*Writer) Insert

func (wtr *Writer) Insert() error

Insert inserts the file Writer.Name into ClickHouse table Writer.Table via the clickhouse-client program.

func (*Writer) Name

func (wtr *Writer) Name() string

Name returns the name of the file Writer points to.

func (*Writer) Separator

func (wtr *Writer) Separator() rune

Separator returns the field separator rune

func (*Writer) Text added in v1.1.2

func (wtr *Writer) Text() string

Text returns the string delimiter

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL