fastx

package
v0.12.1 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jan 3, 2024 License: MIT Imports: 8 Imported by: 25

README

fastx

Go Reference

This package seamlessly parses both FASTA and FASTQ formats.

Examples

Common operation
package main

import (
	"fmt"
	"io"
	"os"

	// "github.com/shenwei356/bio/seq"
	"github.com/shenwei356/bio/seqio/fastx"
	"github.com/shenwei356/xopen"
)

// main copies FASTA/Q records from STDIN to STDOUT, one record at a time.
func main() {
	// "-" selects STDOUT; the returned writer is a buffered output stream
	outfh, err := xopen.Wopen("-")
	checkError(err)
	defer outfh.Close()

	// for large sequences, turning off sequence validation could reduce time:
	// seq.ValidateSeq = false

	reader, err := fastx.NewDefaultReader("-")
	checkError(err)

	for {
		record, err := reader.Read()
		if err == io.EOF {
			break // input exhausted
		}
		if err != nil {
			checkError(err)
			break
		}

		// write through the buffered stream; fmt is slow for output
		// because it's not buffered:
		// fmt.Printf("%s", record.Format(0))
		record.FormatToWriter(outfh, 0)
	}
	reader.Close() // Please remember to call this !!!
}

// checkError prints err to standard error and terminates the process with
// exit status 1. A nil err is a no-op.
func checkError(err error) {
	if err == nil {
		return
	}
	fmt.Fprintln(os.Stderr, err)
	os.Exit(1)
}

Note that, similar to the bytes.Buffer.Bytes() method, the current record will be overwritten by the next call to reader.Read(). Use record.Clone() to make a copy if you need to keep it.

Asynchronously parsing

ChunkChan asynchronously reads FASTA/Q records, and returns a channel of Record Chunk, from which you can easily access the records. bufferSize is the number of buffered chunks, and chunkSize is the size of records in a chunk.

// Open the input; file is a file name ("-" for stdin).
reader, err := fastx.NewDefaultReader(file)
checkError(err)

// Records are read asynchronously and delivered in chunks: bufferSize is the
// number of buffered chunks, chunkSize the number of records in a chunk.
for chunk := range reader.ChunkChan(bufferSize, chunkSize) {
    // each chunk carries its own error; checkError exits on the first failure
    checkError(chunk.Err)

    for _, record := range chunk.Data {
        fmt.Print(record)
    }
}

Note that there's no need to clone the record with record.Clone() here.

Custom alphabet and identifier regular expression
import (
    "github.com/shenwei356/bio/seq"
    "github.com/shenwei356/bio/seqio/fastx"
)

reader, err := fastx.NewReader(seq.DNA, file, "^([^\s]+)\s?")

Documentation

Overview

Package fastx seamlessly parses FASTA and FASTQ format files. This package seamlessly parses both FASTA and FASTQ formats.

## Examples

### Common operation

package main

import (
	"fmt"
	"io"
	"os"

	// "github.com/shenwei356/bio/seq"
	"github.com/shenwei356/bio/seqio/fastx"
	"github.com/shenwei356/xopen"
)

// main copies FASTA/Q records from STDIN to STDOUT, one record at a time.
func main() {
	// use buffered out stream for output
	outfh, err := xopen.Wopen("-") // "-" for STDOUT
	checkError(err)
	defer outfh.Close()

	// disable sequence validation could reduce time when reading large sequences
	// seq.ValidateSeq = false

	reader, err := fastx.NewDefaultReader("-")
	checkError(err)
	// The reader must be closed after use so that it can be recycled.
	defer reader.Close()

	for {
		record, err := reader.Read()
		if err != nil {
			if err == io.EOF {
				break
			}
			checkError(err)
			break
		}

		// fmt is slow for output, because it's not buffered
		// fmt.Printf("%s", record.Format(0))

		record.FormatToWriter(outfh, 0)
	}
}

// checkError prints err to standard error and terminates the process with
// exit status 1. A nil err is a no-op.
func checkError(err error) {
	if err == nil {
		return
	}
	fmt.Fprintln(os.Stderr, err)
	os.Exit(1)
}

***Note that***, similar to the `bytes.Buffer.Bytes()` method, the current record will be overwritten by the next call to `reader.Read()`. Use `record.Clone()` to make a copy if you need to keep it.

### Asynchronously parsing

`ChunkChan` asynchronously reads FASTA/Q records, and returns a channel of Record Chunk, from which you can easily access the records. `bufferSize` is the number of buffered chunks, and `chunkSize` is the size of records in a chunk.

// Open the input; file is a file name ("-" for stdin).
reader, err := fastx.NewDefaultReader(file)
checkError(err)

// Records are read asynchronously and delivered in chunks: bufferSize is the
// number of buffered chunks, chunkSize the number of records in a chunk.
for chunk := range reader.ChunkChan(bufferSize, chunkSize) {
    // each chunk carries its own error; checkError exits on the first failure
    checkError(chunk.Err)

    for _, record := range chunk.Data {
        fmt.Print(record)
    }
}

***Note that***, there's no need to clone the record with `record.Clone()` here.

### Custom alphabet and identifier regular expression

import (
    "github.com/shenwei356/bio/seq"
    "github.com/shenwei356/bio/seqio/fastx"
)

reader, err := fastx.NewReader(seq.DNA, file, "^([^\s]+)\s?")

Index

Constants

This section is empty.

Variables

View Source
var DefaultIDRegexp = `^(\S+)\s?`

DefaultIDRegexp is the default ID parsing regular expression

View Source
var ErrBadFASTQFormat = errors.New("fastx: bad fastq format")

ErrBadFASTQFormat means bad fastq format

View Source
var ErrNoContent = errors.New("fastx: no content found")

ErrNoContent means nothing in the file or stream

View Source
var ErrNotFASTXFormat = errors.New("fastx: invalid FASTA/Q format")

ErrNotFASTXFormat means that the file is not FASTA/Q

View Source
var ErrUnequalSeqAndQual = errors.New("fastx: unequal sequence and quality")

ErrUnequalSeqAndQual means unequal sequence and quality

View Source
var ForcelyOutputFastq bool

ForcelyOutputFastq means outputting the record as FASTQ even if it has no quality (zero-length FASTQ).

Functions

func GetSeqNames

func GetSeqNames(file string) ([]string, error)

GetSeqNames returns the names of a fasta/q file

func GetSeqNumber

func GetSeqNumber(file string) (int, error)

GetSeqNumber returns the number of sequences in a FASTA/Q file.

func GetSeqsMap

func GetSeqsMap(file string, alphabet *seq.Alphabet, bufferSize int, chunkSize int, idRegexp string) (map[string]*Record, error)

GetSeqsMap returns all seqs as a map for fasta file

func GuessAlphabet

func GuessAlphabet(file string) (*seq.Alphabet, bool, error)

GuessAlphabet guesses the alphabet of the file from the first maxLen bases.

func ParseHeadID

func ParseHeadID(idRegexp *regexp.Regexp, head []byte) []byte

ParseHeadID parses the ID from head using idRegexp. Not used.

Types

type Reader

// Reader parses both FASTA and FASTQ formats.
type Reader struct {
	IsFastq bool // true if the file is in FASTQ format

	IDRegexp *regexp.Regexp // regular expression used to parse the record ID

	// Err exists only for compatibility with empty files.
	Err error
	// contains filtered or unexported fields
}

Reader seamlessly parses both FASTA and FASTQ formats.

func NewDefaultReader

func NewDefaultReader(file string) (*Reader, error)

NewDefaultReader automatically recognizes the sequence type and parses the ID in the default manner.

func NewReader

func NewReader(t *seq.Alphabet, file string, idRegexp string) (*Reader, error)

NewReader is constructor of FASTX Reader.

Parameters:

t            sequence alphabet
             if nil is given, it will guess alphabet by the first record
file         file name, "-" for stdin
idRegexp     id parsing regular expression string, must contain "(" and ")" to capture the matched ID
             "" for default value: `^([^\s]+)\s?`
             if the record head does not match idRegexp, the whole name will be the id

Please call reader.Close() after using the records!!!

func NewReaderFromIO

func NewReaderFromIO(t *seq.Alphabet, ioReader io.Reader, idRegexp string) (*Reader, error)

NewReaderFromIO is constructor of FASTX Reader.

Parameters:

t            sequence alphabet
             if nil is given, it will guess alphabet by the first record
ioReader     an io.Reader
idRegexp     id parsing regular expression string, must contain "(" and ")" to capture the matched ID
             "" for default value: `^([^\s]+)\s?`
             if the record head does not match idRegexp, the whole name will be the id

Please call reader.Close() after using the records!!!

func (*Reader) Alphabet

func (fastxReader *Reader) Alphabet() *seq.Alphabet

Alphabet returns Alphabet of the file

func (*Reader) ChunkChan

func (fastxReader *Reader) ChunkChan(bufferSize int, chunkSize int) chan RecordChunk

ChunkChan asynchronously reads FASTA/Q records, and returns a channel of Record Chunk, from which you can easily access the records. bufferSize is the number of buffered chunks, and chunkSize is the size of records in a chunk.

func (*Reader) Close

func (fastxReader *Reader) Close()

Close cleans up everything; most importantly, it recycles the reader. Please do remember to call this method!!!

func (*Reader) Read

func (fastxReader *Reader) Read() (*Record, error)

Read reads and returns one FASTA/Q record. Note that, similar to the bytes.Buffer.Bytes() method, the current record will change after the next call of this method, so use record.Clone() to make a copy if you need to keep it.

func (*Reader) Reset added in v0.10.0

func (fastxReader *Reader) Reset()

type Record

// Record is a struct for one FASTA/Q record.
type Record struct {
	ID   []byte   // record ID
	Name []byte   // full name
	Desc []byte   // description
	Seq  *seq.Seq // sequence
}

Record is a struct for FASTA/Q

func GetSeqs

func GetSeqs(file string, alphabet *seq.Alphabet, bufferSize int, chunkSize int, idRegexp string) ([]*Record, error)

GetSeqs returns the fastx records of a file. When alphabet is nil or seq.Unlimit, it will automatically detect the alphabet. When idRegexp is "", the default idRegexp ( ^([^\s]+)\s? ) will be used.

func NewRecord

func NewRecord(t *seq.Alphabet, id, name, desc, s []byte) (*Record, error)

NewRecord is constructor of type Record for FASTA

func NewRecordWithQual

func NewRecordWithQual(t *seq.Alphabet, id, name, desc, s, q []byte) (*Record, error)

NewRecordWithQual is constructor of type Record for FASTQ

func NewRecordWithQualWithoutValidation

func NewRecordWithQualWithoutValidation(t *seq.Alphabet, id, name, desc, s, q []byte) (*Record, error)

NewRecordWithQualWithoutValidation is constructor of type Record for FASTQ

func NewRecordWithSeq

func NewRecordWithSeq(id, name, desc []byte, s *seq.Seq) (*Record, error)

NewRecordWithSeq is the constructor of type Record for FASTA with an existing seq.Seq object

func NewRecordWithoutValidation

func NewRecordWithoutValidation(t *seq.Alphabet, id, name, desc, s []byte) (*Record, error)

NewRecordWithoutValidation is constructor of type Record for FASTA without validation of the sequence

func (*Record) Clone

func (record *Record) Clone() *Record

Clone of a Record

func (*Record) Format

func (record *Record) Format(width int) []byte

Format returns the formatted sequence record (wrapped to a fixed line width).

func (*Record) FormatToWriter

func (record *Record) FormatToWriter(outfh *xopen.Writer, width int)

FormatToWriter formats and directly writes to writer

func (*Record) String

func (record *Record) String() string

type RecordChunk

// RecordChunk is a chunk of records, the element type of the channel returned
// by Reader.ChunkChan.
type RecordChunk struct {
	ID   uint64    // chunk identifier; presumably a sequence number, TODO confirm
	Data []*Record // records in this chunk
	Err  error     // error attached to this chunk; check it before using Data
}

RecordChunk is chunk for records

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL