speech

package
v0.0.0-...-a7203c7 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: May 29, 2015 License: BSD-3-Clause Imports: 3 Imported by: 0

Documentation

Overview

Package speech provides functionality to parametrize digital waveforms. It computes cepstral features using a sequence of short-term discrete Fourier transforms. Log filterbanks are computed from the DFT and finally the cepstrum is computed using the discrete cosine transform.

Originally developed for speech recognition, this representation can be used as the basis for other applications.

This package was implemented using package github.com/akualab/dsp which should make it easy to modify and adapt to any application.

Index

Constants

This section is empty.

Variables

View Source
// MelFilterbankIndices lists the indices of the filters in the filterbank.
// It has one entry per row of MelFilterbankCoefficients.
var MelFilterbankIndices = []int{
	10, 11, 14, 17, 20, 23, 27, 30, 33,
	36, 40, 45, 50, 56, 62, 69, 76, 84,
}

// MelFilterbankCoefficients is the hardcoded filterbank used by the speech
// example. Each row holds the weights of a single filter.
var MelFilterbankCoefficients = [][]float64{
	{1.0, 1.0, 1.0, 1.0, 0.66, 0.33},
	{0.33, 0.66, 1.0, 1.0, 1.0, 1.0, 0.66, 0.33},
	{0.33, 0.66, 1.0, 1.0, 1.0, 1.0, 0.66, 0.33},
	{0.33, 0.66, 1.0, 1.0, 1.0, 1.0, 0.75, 0.5, 0.25},
	{0.33, 0.66, 1.0, 1.0, 1.0, 1.0, 1.0, 0.66, 0.33},
	{0.25, 0.5, 0.75, 1.0, 1.0, 1.0, 1.0, 0.66, 0.33},
	{0.33, 0.66, 1.0, 1.0, 1.0, 1.0, 0.66, 0.33},
	{0.33, 0.66, 1.0, 1.0, 1.0, 1.0, 0.75, 0.5, 0.25},
	{0.33, 0.66, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.6, 0.4, 0.2},
	{0.25, 0.5, 0.75, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.6, 0.4, 0.2},
	{0.2, 0.4, 0.6, 0.8, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.83, 0.66, 0.5, 0.33, 0.16},
	{0.2, 0.4, 0.6, 0.8, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.83, 0.66, 0.5, 0.33, 0.16},
	{0.16, 0.33, 0.5, 0.66, 0.83, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.85, 0.71, 0.57, 0.42, 0.28, 0.14},
	{0.16, 0.33, 0.5, 0.66, 0.83, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.85, 0.71, 0.57, 0.42, 0.28, 0.14},
	{0.14, 0.28, 0.42, 0.57, 0.71, 0.85, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.875, 0.75, 0.625, 0.5, 0.375, 0.25, 0.125},
	{0.142, 0.285, 0.428, 0.571, 0.714, 0.857, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.88, 0.77, 0.66, 0.55, 0.44, 0.33, 0.22, 0.11},
	{0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.88, 0.77, 0.66, 0.55, 0.44, 0.33, 0.22, 0.11},
	{0.11, 0.22, 0.33, 0.44, 0.55, 0.66, 0.77, 0.88, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0},
}
View Source
// DefaultFeatures holds the list of default feature names.
var DefaultFeatures = []string{
	// Energy-based features.
	"normalized cepstral energy", "delta energy", "delta delta energy",
	// Cepstrum-based features.
	"zm cepstrum", "delta cepstrum", "delta delta cepstrum",
}

DefaultFeatures has a list of the default feature names.

Functions

func New

func New(name string, source *wav.SourceProc, c Config) (*dsp.App, error)

New creates a new speech dsp app.

Types

type Config

// Config holds the parameters for the speech feature extractor.
type Config struct {
	// Sampling rate.
	FS float64
	// Processor buffer size.
	BufSize int
	// Frame size in samples.
	WinSize int
	// Frame advance step in samples.
	WinStep int
	// Window Type (0: Rect, 1: Hann, 2: Hamm, 3: Blackman)
	WinType int
	// Log of the FFT size in samples.
	// NOTE(review): presumably the base-2 logarithm — confirm against the FFT processor.
	LogFFTSize int
	// Number of filterbank elements.
	FBSize int
	// Min filterbank frequency.
	FBMinFreq float64
	// Max filterbank frequency.
	FBMaxFreq float64
	// Number of cepstral elements.
	CepSize int
	// Coefficients for computing deltas.
	DeltaCoeff []float64
	// Name of the feature(s).
	Features []string
}

Config parameters for speech feature extractor.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL