csv

package
v18.0.0-rc0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Oct 15, 2024 License: Apache-2.0, BSD-3-Clause Imports: 21 Imported by: 1

Documentation

Overview

Package csv reads CSV files and presents the extracted data as Arrow records. It also writes Arrow records to CSV files.

Example
package main

import (
	"bytes"
	"fmt"
	"log"

	"github.com/apache/arrow-go/v18/arrow"
	"github.com/apache/arrow-go/v18/arrow/csv"
)

func main() {
	// Semicolon-separated input; the leading '#' line is skipped as a comment.
	const data = `## a simple set of data: int64;float64;string
0;0;str-0
1;1;str-1
2;2;str-2
3;3;str-3
4;4;str-4
5;5;str-5
6;6;str-6
7;7;str-7
8;8;str-8
9;9;str-9
`

	// One schema field per CSV column, in column order.
	fields := []arrow.Field{
		{Name: "i64", Type: arrow.PrimitiveTypes.Int64},
		{Name: "f64", Type: arrow.PrimitiveTypes.Float64},
		{Name: "str", Type: arrow.BinaryTypes.String},
	}
	schema := arrow.NewSchema(fields, nil)

	reader := csv.NewReader(
		bytes.NewBufferString(data),
		schema,
		csv.WithComment('#'),
		csv.WithComma(';'),
	)
	defer reader.Release()

	// Print every column of every record, tagged with the record index.
	for idx := 0; reader.Next(); idx++ {
		record := reader.Record()
		for c, column := range record.Columns() {
			fmt.Printf("rec[%d][%q]: %v\n", idx, record.ColumnName(c), column)
		}
	}

	// A non-nil error here indicates a problem converting CSV values
	// to the arrow schema types.
	if err := reader.Err(); err != nil {
		log.Fatal(err)
	}
}
Output:

rec[0]["i64"]: [0]
rec[0]["f64"]: [0]
rec[0]["str"]: ["str-0"]
rec[1]["i64"]: [1]
rec[1]["f64"]: [1]
rec[1]["str"]: ["str-1"]
rec[2]["i64"]: [2]
rec[2]["f64"]: [2]
rec[2]["str"]: ["str-2"]
rec[3]["i64"]: [3]
rec[3]["f64"]: [3]
rec[3]["str"]: ["str-3"]
rec[4]["i64"]: [4]
rec[4]["f64"]: [4]
rec[4]["str"]: ["str-4"]
rec[5]["i64"]: [5]
rec[5]["f64"]: [5]
rec[5]["str"]: ["str-5"]
rec[6]["i64"]: [6]
rec[6]["f64"]: [6]
rec[6]["str"]: ["str-6"]
rec[7]["i64"]: [7]
rec[7]["f64"]: [7]
rec[7]["str"]: ["str-7"]
rec[8]["i64"]: [8]
rec[8]["f64"]: [8]
rec[8]["str"]: ["str-8"]
rec[9]["i64"]: [9]
rec[9]["f64"]: [9]
rec[9]["str"]: ["str-9"]
Example (WithChunk)
package main

import (
	"bytes"
	"fmt"

	"github.com/apache/arrow-go/v18/arrow"
	"github.com/apache/arrow-go/v18/arrow/csv"
)

func main() {
	// Semicolon-separated input; the leading '#' line is skipped as a comment.
	f := bytes.NewBufferString(`## a simple set of data: int64;float64;string
0;0;str-0
1;1;str-1
2;2;str-2
3;3;str-3
4;4;str-4
5;5;str-5
6;6;str-6
7;7;str-7
8;8;str-8
9;9;str-9
`)

	schema := arrow.NewSchema(
		[]arrow.Field{
			{Name: "i64", Type: arrow.PrimitiveTypes.Int64},
			{Name: "f64", Type: arrow.PrimitiveTypes.Float64},
			{Name: "str", Type: arrow.BinaryTypes.String},
		},
		nil,
	)
	r := csv.NewReader(
		f, schema,
		csv.WithComment('#'), csv.WithComma(';'),
		// Group rows into records of 3 rows each; with 10 input rows the
		// last record holds the single remaining row.
		csv.WithChunk(3),
	)
	defer r.Release()

	n := 0
	for r.Next() {
		rec := r.Record()
		for i, col := range rec.Columns() {
			fmt.Printf("rec[%d][%q]: %v\n", n, rec.ColumnName(i), col)
		}
		n++
	}

	// The loop also ends after a failure, so check for reader errors
	// indicating issues converting csv values to the arrow schema types.
	// panic is used instead of exiting directly so that the deferred
	// Release above still runs.
	if err := r.Err(); err != nil {
		panic(err)
	}
}
Output:

rec[0]["i64"]: [0 1 2]
rec[0]["f64"]: [0 1 2]
rec[0]["str"]: ["str-0" "str-1" "str-2"]
rec[1]["i64"]: [3 4 5]
rec[1]["f64"]: [3 4 5]
rec[1]["str"]: ["str-3" "str-4" "str-5"]
rec[2]["i64"]: [6 7 8]
rec[2]["f64"]: [6 7 8]
rec[2]["str"]: ["str-6" "str-7" "str-8"]
rec[3]["i64"]: [9]
rec[3]["f64"]: [9]
rec[3]["str"]: ["str-9"]
Example (Writer)
package main

import (
	"bytes"
	"fmt"
	"log"

	"github.com/apache/arrow-go/v18/arrow"
	"github.com/apache/arrow-go/v18/arrow/array"
	"github.com/apache/arrow-go/v18/arrow/csv"
	"github.com/apache/arrow-go/v18/arrow/memory"
)

func main() {
	f := new(bytes.Buffer)

	pool := memory.NewGoAllocator()
	schema := arrow.NewSchema(
		[]arrow.Field{
			{Name: "i64", Type: arrow.PrimitiveTypes.Int64},
			{Name: "f64", Type: arrow.PrimitiveTypes.Float64},
			{Name: "str", Type: arrow.BinaryTypes.String},
		},
		nil,
	)

	b := array.NewRecordBuilder(pool, schema)
	defer b.Release()

	b.Field(0).(*array.Int64Builder).AppendValues([]int64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, nil)
	b.Field(1).(*array.Float64Builder).AppendValues([]float64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, nil)
	b.Field(2).(*array.StringBuilder).AppendValues([]string{"str-0", "str-1", "str-2", "str-3", "str-4", "str-5", "str-6", "str-7", "str-8", "str-9"}, nil)

	rec := b.NewRecord()
	defer rec.Release()

	w := csv.NewWriter(f, schema, csv.WithComma(';'))
	err := w.Write(rec)
	if err != nil {
		log.Fatal(err)
	}

	err = w.Flush()
	if err != nil {
		log.Fatal(err)
	}

	err = w.Error()
	if err != nil {
		log.Fatal(err)
	}

	r := csv.NewReader(f, schema, csv.WithComment('#'), csv.WithComma(';'))
	defer r.Release()

	n := 0
	for r.Next() {
		rec := r.Record()
		for i, col := range rec.Columns() {
			fmt.Printf("rec[%d][%q]: %v\n", n, rec.ColumnName(i), col)
		}
		n++
	}

	// check for reader errors indicating issues converting csv values
	// to the arrow schema types
	err = r.Err()
	if err != nil {
		log.Fatal(err)
	}

}
Output:

rec[0]["i64"]: [0]
rec[0]["f64"]: [0]
rec[0]["str"]: ["str-0"]
rec[1]["i64"]: [1]
rec[1]["f64"]: [1]
rec[1]["str"]: ["str-1"]
rec[2]["i64"]: [2]
rec[2]["f64"]: [2]
rec[2]["str"]: ["str-2"]
rec[3]["i64"]: [3]
rec[3]["f64"]: [3]
rec[3]["str"]: ["str-3"]
rec[4]["i64"]: [4]
rec[4]["f64"]: [4]
rec[4]["str"]: ["str-4"]
rec[5]["i64"]: [5]
rec[5]["f64"]: [5]
rec[5]["str"]: ["str-5"]
rec[6]["i64"]: [6]
rec[6]["f64"]: [6]
rec[6]["str"]: ["str-6"]
rec[7]["i64"]: [7]
rec[7]["f64"]: [7]
rec[7]["str"]: ["str-7"]
rec[8]["i64"]: [8]
rec[8]["f64"]: [8]
rec[8]["str"]: ["str-8"]
rec[9]["i64"]: [9]
rec[9]["f64"]: [9]
rec[9]["str"]: ["str-9"]

Index

Examples

Constants

This section is empty.

Variables

View Source
// DefaultNullValues lists the strings treated as NULL by default: the empty
// string, "NULL" and "null".
var DefaultNullValues = []string{"", "NULL", "null"}

DefaultNullValues is the set of values considered as NULL values by default when Reader is configured to handle NULL values.

View Source
var (
	// ErrMismatchFields is a sentinel error for a count mismatch.
	// NOTE(review): presumably raised when the number of fields in a CSV row
	// differs from the number of schema fields — confirm against the reader.
	ErrMismatchFields = errors.New("arrow/csv: number of records mismatch")
)

Functions

This section is empty.

Types

type Option

type Option func(config)

Option configures a CSV reader/writer.

func WithAllocator

func WithAllocator(mem memory.Allocator) Option

WithAllocator specifies the Arrow memory allocator used while building records.

func WithBoolWriter

func WithBoolWriter(fmtr func(bool) string) Option

WithBoolWriter overrides the default bool formatter with a function that returns the string representation of a bool value, e.g. True/False or 1/0.

func WithCRLF

func WithCRLF(useCRLF bool) Option

WithCRLF specifies the line terminator used while writing CSV files. If useCRLF is true, \r\n is used as the line terminator, otherwise \n is used. The default value is false.

func WithChunk

func WithChunk(n int) Option

WithChunk specifies the chunk size used while parsing CSV files.

If n is zero or 1, no chunking will take place and the reader will create one record per row. If n is greater than 1, chunks of n rows will be read. If n is negative, the reader will load the whole CSV file into memory and create one big record with all the rows.

func WithColumnTypes

func WithColumnTypes(types map[string]arrow.DataType) Option

WithColumnTypes allows specifying optional per-column types (disabling type inference on those columns).

Will panic if used in conjunction with an explicit schema.

func WithComma

func WithComma(c rune) Option

WithComma specifies the field separator character used while parsing or writing CSV files.

func WithComment

func WithComment(c rune) Option

WithComment specifies the comment character used while parsing CSV files.

func WithHeader

func WithHeader(useHeader bool) Option

WithHeader enables or disables CSV-header handling.

func WithIncludeColumns

func WithIncludeColumns(cols []string) Option

WithIncludeColumns indicates the names of the columns from the CSV file that should actually be read and converted (in the slice's order). If set and non-empty, columns not in this slice will be ignored.

Will panic if used in conjunction with an explicit schema.

func WithLazyQuotes

func WithLazyQuotes(useLazyQuotes bool) Option

WithLazyQuotes sets the LazyQuotes option of the underlying CSV parser.

func WithNullReader

func WithNullReader(stringsCanBeNull bool, nullValues ...string) Option

WithNullReader sets options for a CSV Reader pertaining to NULL value handling. If stringsCanBeNull is true, then a string that matches one of the nullValues set will be interpreted as NULL. Numeric columns will be checked for nulls in all cases. If no nullValues arguments are passed in, the defaults set in NewReader() will be kept.

When no NULL values are given, the default set is taken from DefaultNullValues.

func WithNullWriter

func WithNullWriter(null string) Option

WithNullWriter sets the null string written for NULL values. The default is set in NewWriter().

func WithStringsReplacer

func WithStringsReplacer(replacer *strings.Replacer) Option

WithStringsReplacer receives a replacer to be applied in the string fields of the CSV. This is useful to remove unwanted characters from the string.

type Reader

type Reader struct {
	// contains filtered or unexported fields
}

Reader wraps encoding/csv.Reader and creates array.Records from a schema.

func NewInferringReader

func NewInferringReader(r io.Reader, opts ...Option) *Reader

NewInferringReader creates a CSV reader that attempts to infer the types and column names from the data in the first row of the CSV file.

This can be further customized using the WithColumnTypes and WithIncludeColumns options. For BinaryType the reader will use base64 decoding with padding as per base64.StdEncoding.

func NewReader

func NewReader(r io.Reader, schema *arrow.Schema, opts ...Option) *Reader

NewReader returns a reader that reads from the CSV file and creates arrow.Records from the given schema.

NewReader panics if the given schema contains fields that have types that are not primitive types.

func (*Reader) Err

func (r *Reader) Err() error

Err returns the last error encountered during the iteration over the underlying CSV file.

func (*Reader) Next

func (r *Reader) Next() bool

Next returns whether a Record could be extracted from the underlying CSV file.

Next panics if the number of fields extracted from a CSV row does not match the number of fields of the associated schema. If a parse failure occurs, Next will return true and the Record will contain nulls where failures occurred. Subsequent calls to Next will then return false. The user should check Err() after each call to Next to see whether an error took place.

func (*Reader) Record

func (r *Reader) Record() arrow.Record

Record returns the current record that has been extracted from the underlying CSV file. It is valid until the next call to Next.

func (*Reader) Release

func (r *Reader) Release()

Release decreases the reference count by 1. When the reference count goes to zero, the memory is freed. Release may be called simultaneously from multiple goroutines.

func (*Reader) Retain

func (r *Reader) Retain()

Retain increases the reference count by 1. Retain may be called simultaneously from multiple goroutines.

func (*Reader) Schema

func (r *Reader) Schema() *arrow.Schema

type Writer

type Writer struct {
	// contains filtered or unexported fields
}

Writer wraps encoding/csv.Writer and writes arrow.Record based on a schema.

func NewWriter

func NewWriter(w io.Writer, schema *arrow.Schema, opts ...Option) *Writer

NewWriter returns a writer that writes arrow.Records to the CSV file with the given schema.

NewWriter panics if the given schema contains fields that have types that are not primitive types. For BinaryType the writer will use base64 encoding with padding as per base64.StdEncoding.

func (*Writer) Error

func (w *Writer) Error() error

Error reports any error that has occurred during a previous Write or Flush.

func (*Writer) Flush

func (w *Writer) Flush() error

Flush writes any buffered data to the underlying csv Writer. If an error occurs during the flush, it is returned.

func (*Writer) Schema

func (w *Writer) Schema() *arrow.Schema

func (*Writer) Write

func (w *Writer) Write(record arrow.Record) error

Write writes a single Record to the CSV file, emitting one CSV row for each row of the record.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL