metrics

package
v0.6.2 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Sep 9, 2024 License: MIT Imports: 4 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
// Assessment keys registered by this package. The second argument of RegisterAssessmentKey is the
// key's score multiplier; a key registered with a multiplier of zero is ignored for the score computation.
var (
	// AssessmentKeyFilesExecuted holds the successfully executed files.
	AssessmentKeyFilesExecuted = RegisterAssessmentKey("files-executed", 1)
	// AssessmentKeyFilesExecutedMaximumReachable holds the maximum theoretically reachable executed files.
	// Its multiplier is zero, so it does not contribute to the score.
	AssessmentKeyFilesExecutedMaximumReachable = RegisterAssessmentKey("files-executed-maximum-reachable", 0)
	// AssessmentKeyProcessingTime holds the time in milliseconds that it took to complete the task.
	// Its multiplier is zero, so it does not contribute to the score.
	AssessmentKeyProcessingTime = RegisterAssessmentKey("processing-time", 0)

	// AssessmentKeyCoverage counts execution coverage objects.
	AssessmentKeyCoverage = RegisterAssessmentKey("coverage", 10)

	// AssessmentKeyTestsPassing holds the percentage of passing tests.
	AssessmentKeyTestsPassing = RegisterAssessmentKey("tests-passing", 10)

	// AssessmentKeyResponseCharacterCount counts the number of characters of a response.
	// Its multiplier is zero, so it does not contribute to the score.
	AssessmentKeyResponseCharacterCount = RegisterAssessmentKey("response-character-count", 0)
	// AssessmentKeyGenerateTestsForFileCharacterCount counts the number of characters of a generated test file.
	// Its multiplier is zero, so it does not contribute to the score.
	AssessmentKeyGenerateTestsForFileCharacterCount = RegisterAssessmentKey("generate-tests-for-file-character-count", 0)

	// AssessmentKeyResponseNoError indicates that a model responded without error.
	AssessmentKeyResponseNoError = RegisterAssessmentKey("response-no-error", 1)
	// AssessmentKeyResponseWithCode indicates that a model responded with code.
	AssessmentKeyResponseWithCode = RegisterAssessmentKey("response-with-code", 1)
	// AssessmentKeyResponseNoExcess indicates that a model did not produce more content than requested.
	// TODO Infer if a model produced "too much" code. https://github.com/symflower/eval-dev-quality/issues/44
	AssessmentKeyResponseNoExcess = RegisterAssessmentKey("response-no-excess", 1)
)
View Source
// Assessment categories defined by this package. Each one is created through registerAssessmentCategory.
// NOTE(review): presumably registration also records the category in AllAssessmentCategories — confirm
// against the implementation of registerAssessmentCategory.
var (
	// AssessmentCategoryUnknown indicates that it is not possible to compute a model's category.
	AssessmentCategoryUnknown = registerAssessmentCategory(AssessmentCategory{
		ID:          "category-unknown",
		Name:        "category unknown",
		Description: "Models in this category could not be categorized.",
	})
	// AssessmentCategoryResponseError indicates that a model has encountered an error trying to produce a response.
	AssessmentCategoryResponseError = registerAssessmentCategory(AssessmentCategory{
		ID:          "response-error",
		Name:        "response error",
		Description: "Models in this category encountered an error.",
	})
	// AssessmentCategoryResponseNoCode indicates that a model's response did not contain any source code.
	AssessmentCategoryResponseNoCode = registerAssessmentCategory(AssessmentCategory{
		ID:          "response-no-code",
		Name:        "no code",
		Description: "Models in this category produced no code.",
	})
	// AssessmentCategoryCodeInvalid indicates that a model's generated code produced an error when executed.
	AssessmentCategoryCodeInvalid = registerAssessmentCategory(AssessmentCategory{
		ID:          "code-invalid",
		Name:        "invalid code",
		Description: "Models in this category produced invalid code.",
	})
	// AssessmentCategoryCodeExecuted indicates that a model's generated code could be executed without an error.
	AssessmentCategoryCodeExecuted = registerAssessmentCategory(AssessmentCategory{
		ID:          "code-executed",
		Name:        "executable code",
		Description: "Models in this category produced executable code.",
	})
	// AssessmentCategoryCodeCoverageStatementReached indicates that a model's generated code reached 100% statement coverage.
	AssessmentCategoryCodeCoverageStatementReached = registerAssessmentCategory(AssessmentCategory{
		ID:          "code-coverage-statement",
		Name:        "statement coverage reached",
		Description: "Models in this category produced code that reached full statement coverage.",
	})
	// AssessmentCategoryCodeNoExcess indicates that a model's response did not contain more content than requested.
	AssessmentCategoryCodeNoExcess = registerAssessmentCategory(AssessmentCategory{
		ID:          "code-no-excess",
		Name:        "no excess response",
		Description: "Models in this category did not respond with more content than requested.",
	})
)
View Source
// AllAssessmentCategories holds all assessment categories.
var AllAssessmentCategories []*AssessmentCategory

AllAssessmentCategories holds all assessment categories.

View Source
var (

	// AllAssessmentKeysStrings holds all registered assessment keys as strings.
	AllAssessmentKeysStrings []string
)

Functions

This section is empty.

Types

type AssessmentCategory

// AssessmentCategory represents a categorical ranking of a model based on Assessments.
type AssessmentCategory struct {
	// ID holds a unique identifier of the category.
	ID string
	// Name holds a short, human-readable name of the category.
	Name string
	// Description holds a sentence describing which models fall into the category.
	Description string
}

AssessmentCategory represents a categorical ranking of a model based on Assessments.

type AssessmentKey

// AssessmentKey defines a key for a numerical key-value assessment pair.
type AssessmentKey string

AssessmentKey defines a key for a numerical key-value assessment pair.

func RegisterAssessmentKey

func RegisterAssessmentKey(key string, multiplier uint64) AssessmentKey

RegisterAssessmentKey registers a new assessment key. If the multiplier for this assessment type is zero, it is ignored for the score computation.

type Assessments

// Assessments holds a collection of numerical assessment metrics, keyed by AssessmentKey.
type Assessments map[AssessmentKey]uint64

Assessments holds a collection of numerical assessment metrics.

func CombineWithSymflowerFixAssessments added in v0.6.0

func CombineWithSymflowerFixAssessments(model Assessments, fixed Assessments) (combined Assessments)

CombineWithSymflowerFixAssessments combines the model assessments with the ones from "symflower fix".

func Merge

func Merge(a Assessments, b Assessments) (c Assessments)

Merge combines two assessment collections into a new assessment collection and returns the new assessment collection.

func NewAssessments

func NewAssessments() Assessments

NewAssessments creates a new assessment collection.

func (Assessments) Add

func (a Assessments) Add(x Assessments)

Add adds the given assessment collection to the current one.

func (Assessments) Award

func (a Assessments) Award(key AssessmentKey)

Award yields the score points defined for the given key.

func (Assessments) AwardPoints

func (a Assessments) AwardPoints(key AssessmentKey, count uint64)

AwardPoints yields multiple score points defined for the given key.

func (Assessments) Category

func (a Assessments) Category(totalTasks uint64) *AssessmentCategory

Category infers a categorical ranking of a model based on assessment values. A model's overall category corresponds to the criterion for which the model was consistently able to receive the "total" amount of points. For example, if there were 3 tasks in total and a model was able to produce executing code for all tasks, but the coverage goal was reached in only one case, then the category is only "CodeExecuted" because the coverage goal was not reached consistently. The returned category is never "nil".

func (Assessments) Equal

func (a Assessments) Equal(x Assessments) bool

Equal checks if both assessment collections are equal.

func (Assessments) Score

func (a Assessments) Score() (score uint64)

Score computes the score over all assessments in the collection.

func (Assessments) String

func (a Assessments) String() string

String returns a string representation of the metrics.

func (Assessments) StringCSV

func (a Assessments) StringCSV() (row []string)

StringCSV returns a CSV row string representation of the metrics.

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL