gff

package
v0.31.2 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Oct 21, 2024 License: MIT Imports: 9 Imported by: 0

Documentation

Overview

Package gff provides gff parsers and writers.

GFF stands for "general feature format". It is an alternative to GenBank for storing data about genomic sequences. While not often used in synthetic biology research, it is more commonly used in bioinformatics for digesting features of genomic sequences.

This package provides a parser and writer to convert between the gff file format and the more general poly.Sequence struct.

Example (Basic)

This example shows how to open a gff file and search for a gene given its locus tag. We then display the EC number of that particular gene.

package main

import (
	"fmt"

	"github.com/bebop/poly/io/gff"
)

// main reads the short E. coli MG1655 gff file, looks for the feature whose
// "locus_tag" attribute is "b0003", and prints that feature's "EC_number"
// attribute.
func main() {
	sequence, err := gff.Read("../../data/ecoli-mg1655-short.gff")
	if err != nil {
		// Nothing can be searched if the file did not load; report and stop.
		fmt.Println(err)
		return
	}

	for _, feature := range sequence.Features {
		if feature.Attributes["locus_tag"] == "b0003" {
			fmt.Println(feature.Attributes["EC_number"])
		}
	}
}
Output:

2.7.1.39

Index

Examples

Constants

This section is empty.

Variables

This section is empty.

Functions

func Build

func Build(sequence Gff) ([]byte, error)

Build takes a Gff struct and returns a byte slice representing a gff file to be written out.

Example
package main

import (
	"bytes"
	"fmt"

	"github.com/bebop/poly/io/gff"
)

// main round-trips a gff file: it reads the file, builds gff bytes from the
// parsed struct, parses those bytes again, and prints the name stored in the
// re-parsed metadata.
func main() {
	sequence, err := gff.Read("../../data/ecoli-mg1655-short.gff")
	if err != nil {
		fmt.Println(err)
		return
	}

	gffBytes, err := gff.Build(sequence)
	if err != nil {
		fmt.Println(err)
		return
	}

	// Parse accepts an io.Reader, so wrap the built bytes in a reader.
	reparsedSequence, err := gff.Parse(bytes.NewReader(gffBytes))
	if err != nil {
		fmt.Println(err)
		return
	}

	fmt.Println(reparsedSequence.Meta.Name)
}
Output:

U00096.3

func Write

func Write(sequence Gff, path string) error

Write takes a Gff struct and a path string and writes out a gff file to that path.

Example
package main

import (
	"fmt"
	"os"
	"path/filepath"

	"github.com/bebop/poly/io/gff"
)

// main reads a gff file, writes it back out into a temporary directory, reads
// the written copy, and prints the name stored in the copy's metadata.
func main() {
	tmpDataDir, err := os.MkdirTemp("", "data-*")
	if err != nil {
		// Without a temporary directory there is nowhere to write; stop here
		// instead of carrying on with an empty path.
		fmt.Println(err.Error())
		return
	}
	// Deferred calls still run on the early returns below, so the temporary
	// directory is cleaned up on every path.
	defer os.RemoveAll(tmpDataDir)

	sequence, err := gff.Read("../../data/ecoli-mg1655-short.gff")
	if err != nil {
		fmt.Println(err.Error())
		return
	}

	tmpGffFilePath := filepath.Join(tmpDataDir, "ecoli-mg1655-short.gff")
	if err := gff.Write(sequence, tmpGffFilePath); err != nil {
		fmt.Println(err.Error())
		return
	}

	testSequence, err := gff.Read(tmpGffFilePath)
	if err != nil {
		fmt.Println(err.Error())
		return
	}

	fmt.Println(testSequence.Meta.Name)
}
Output:

U00096.3

Types

type Feature

// Feature is a struct that represents a feature in a gff file.
type Feature struct {
	// NOTE(review): Name through Phase are all plain strings; presumably they
	// mirror the text columns of a gff feature line — confirm against the parser.
	Name           string            `json:"name"`
	Source         string            `json:"source"`
	Type           string            `json:"type"`
	Score          string            `json:"score"`
	Strand         string            `json:"strand"`
	Phase          string            `json:"phase"`
	// Attributes maps an attribute key (for example "locus_tag") to its string value.
	Attributes     map[string]string `json:"attributes"`
	// Location holds the feature's position; see the Location type.
	Location       Location          `json:"location"`
	// ParentSequence is a pointer to a Gff and is excluded from JSON by the "-" tag.
	// NOTE(review): presumably the Gff this feature was added to — confirm.
	ParentSequence *Gff              `json:"-"`
}

Feature is a struct that represents a feature in a gff file.

func (Feature) GetSequence

func (feature Feature) GetSequence() (string, error)

GetSequence takes a feature and returns a sequence string for that feature.

Example
package main

import (
	"fmt"

	"github.com/bebop/poly/io/gff"
)

// main builds a Gff holding a GFP sequence, adds one feature whose location
// covers that whole sequence, and prints whether the sequence returned for the
// feature equals the original sequence.
func main() {
	// Sequence for green fluorescent protein (GFP) that we're using as test data for this example.
	gfpSequence := "ATGGCTAGCAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAATTAGATGGTGATGTTAATGGGCACAAATTTTCTGTCAGTGGAGAGGGTGAAGGTGATGCTACATACGGAAAGCTTACCCTTAAATTTATTTGCACTACTGGAAAACTACCTGTTCCATGGCCAACACTTGTCACTACTTTCTCTTATGGTGTTCAATGCTTTTCCCGTTATCCGGATCATATGAAACGGCATGACTTTTTCAAGAGTGCCATGCCCGAAGGTTATGTACAGGAACGCACTATATCTTTCAAAGATGACGGGAACTACAAGACGCGTGCTGAAGTCAAGTTTGAAGGTGATACCCTTGTTAATCGTATCGAGTTAAAAGGTATTGATTTTAAAGAAGATGGAAACATTCTCGGACACAAACTCGAGTACAACTATAACTCACACAATGTATACATCACGGCAGACAAACAAAAGAATGGAATCAAAGCTAACTTCAAAATTCGCCACAACATTGAAGATGGATCCGTTCAACTAGCAGACCATTATCAACAAAATACTCCAATTGGCGATGGCCCTGTCCTTTTACCAGACAACCATTACCTGTCGACACAATCTGCCCTTTCGAAAGATCCCAACGAAAAGCGTGACCACATGGTCCTTCTTGAGTTTGTAACTGCTGCTGGGATTACACATGGCATGGATGAGCTCTACAAATAA"

	// Initialize sequence and feature structs.
	var sequence gff.Gff
	var feature gff.Feature

	// Set the initialized sequence struct's sequence.
	sequence.Sequence = gfpSequence

	// Set the feature's location so that it spans the entire sequence.
	feature.Location.Start = 0
	feature.Location.End = len(sequence.Sequence)

	// Add the GFP feature to the sequence struct.
	if err := sequence.AddFeature(&feature); err != nil {
		fmt.Println(err)
		return
	}

	// Get the GFP feature sequence string from the sequence struct.
	featureSequence, err := feature.GetSequence()
	if err != nil {
		fmt.Println(err)
		return
	}

	// Check to see if the feature was inserted properly into the sequence.
	fmt.Println(gfpSequence == featureSequence)
}
Output:

true

type Gff

// Gff is a struct that represents a gff file.
type Gff struct {
	// Meta holds file-level information; see the Meta type.
	Meta     Meta
	Features []Feature // will need a GetFeatures interface to standardize
	// Sequence is the sequence text as a plain string.
	// NOTE(review): presumably Feature locations index into this string — confirm.
	Sequence string
}

Gff is a struct that represents a gff file.

func Parse

func Parse(file io.Reader) (Gff, error)

Parse takes in an io.Reader containing a gffv3 file and parses it into a Gff struct.

Example
package main

import (
	"fmt"
	"os"

	"github.com/bebop/poly/io/gff"
)

// main opens a gff file, parses it from the open file handle, and prints the
// name stored in the parsed metadata.
func main() {
	file, err := os.Open("../../data/ecoli-mg1655-short.gff")
	if err != nil {
		fmt.Println(err)
		return
	}
	// Release the file handle once parsing is finished.
	defer file.Close()

	sequence, err := gff.Parse(file)
	if err != nil {
		fmt.Println(err)
		return
	}

	fmt.Println(sequence.Meta.Name)
}
Output:

U00096.3

func Read

func Read(path string) (Gff, error)

Read takes in a filepath for a .gffv3 file and parses it into a Gff struct.

Example
package main

import (
	"fmt"

	"github.com/bebop/poly/io/gff"
)

// main reads a gff file from disk and prints the name stored in its metadata.
func main() {
	sequence, err := gff.Read("../../data/ecoli-mg1655-short.gff")
	if err != nil {
		// Report the failure instead of printing the name of an unparsed file.
		fmt.Println(err)
		return
	}
	fmt.Println(sequence.Meta.Name)
}
Output:

U00096.3

func (*Gff) AddFeature

func (sequence *Gff) AddFeature(feature *Feature) error

AddFeature takes a feature and adds it to the Gff struct.

Example
package main

import (
	"fmt"

	"github.com/bebop/poly/io/gff"
)

// main builds a Gff holding a GFP sequence, adds one feature whose location
// covers that whole sequence via AddFeature, and prints whether the sequence
// returned for the feature equals the original sequence.
func main() {
	// Sequence for green fluorescent protein (GFP) that we're using as test data for this example.
	gfpSequence := "ATGGCTAGCAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAATTAGATGGTGATGTTAATGGGCACAAATTTTCTGTCAGTGGAGAGGGTGAAGGTGATGCTACATACGGAAAGCTTACCCTTAAATTTATTTGCACTACTGGAAAACTACCTGTTCCATGGCCAACACTTGTCACTACTTTCTCTTATGGTGTTCAATGCTTTTCCCGTTATCCGGATCATATGAAACGGCATGACTTTTTCAAGAGTGCCATGCCCGAAGGTTATGTACAGGAACGCACTATATCTTTCAAAGATGACGGGAACTACAAGACGCGTGCTGAAGTCAAGTTTGAAGGTGATACCCTTGTTAATCGTATCGAGTTAAAAGGTATTGATTTTAAAGAAGATGGAAACATTCTCGGACACAAACTCGAGTACAACTATAACTCACACAATGTATACATCACGGCAGACAAACAAAAGAATGGAATCAAAGCTAACTTCAAAATTCGCCACAACATTGAAGATGGATCCGTTCAACTAGCAGACCATTATCAACAAAATACTCCAATTGGCGATGGCCCTGTCCTTTTACCAGACAACCATTACCTGTCGACACAATCTGCCCTTTCGAAAGATCCCAACGAAAAGCGTGACCACATGGTCCTTCTTGAGTTTGTAACTGCTGCTGGGATTACACATGGCATGGATGAGCTCTACAAATAA"

	// Initialize sequence and feature structs.
	var sequence gff.Gff
	var feature gff.Feature

	// Set the initialized sequence struct's sequence.
	sequence.Sequence = gfpSequence

	// Set the feature's location so that it spans the entire sequence. The
	// feature was declared with var, so its Location is already the zero value
	// and needs no explicit reset first.
	feature.Location.Start = 0
	feature.Location.End = len(sequence.Sequence)

	// Add the GFP feature to the sequence struct.
	if err := sequence.AddFeature(&feature); err != nil {
		fmt.Println(err)
		return
	}

	// Get the GFP feature sequence string from the sequence struct.
	featureSequence, err := feature.GetSequence()
	if err != nil {
		fmt.Println(err)
		return
	}

	// Check to see if the feature was inserted properly into the sequence.
	fmt.Println(gfpSequence == featureSequence)
}
Output:

true

type Location

// Location is a struct that represents a location in a gff file.
type Location struct {
	// Start and End bound the location as integers.
	// NOTE(review): the package examples cover a whole sequence with Start = 0
	// and End = len(sequence), which suggests 0-based, end-exclusive — confirm.
	Start             int        `json:"start"`
	End               int        `json:"end"`
	// NOTE(review): the flags below presumably follow GenBank-style location
	// qualifiers (complement, join, partial ends) — confirm against the parser.
	Complement        bool       `json:"complement"`
	Join              bool       `json:"join"`
	FivePrimePartial  bool       `json:"five_prime_partial"`
	ThreePrimePartial bool       `json:"three_prime_partial"`
	// SubLocations nests further Location values inside this one.
	SubLocations      []Location `json:"sub_locations"`
}

Location is a struct that represents a location in a gff file.

type Meta

// Meta holds meta information about a gff file.
type Meta struct {
	// Name is what the package examples print after parsing a file
	// ("U00096.3" for the bundled E. coli sample).
	Name                 string   `json:"name"`
	Description          string   `json:"description"`
	// Version is serialized under the JSON key "gff_version".
	Version              string   `json:"gff_version"`
	// NOTE(review): RegionStart, RegionEnd and Size presumably describe the
	// sequence region declared in the file header — confirm against the parser.
	RegionStart          int      `json:"region_start"`
	RegionEnd            int      `json:"region_end"`
	Size                 int      `json:"size"`
	SequenceHash         string   `json:"sequence_hash"`
	// SequenceHashFunction is serialized under the JSON key "hash_function".
	SequenceHashFunction string   `json:"hash_function"`
	CheckSum             [32]byte `json:"checkSum"` // blake3 checksum of the parsed file itself. Useful for if you want to check if incoming genbank/gff files are different.
}

Meta holds meta information about a gff file.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL