// Source: https://github.com/jbrukh/bayesian (Gitee mirror, synced daily).

package bayesian

import (
	"encoding/gob"
	"errors"
	"io"
	"math"
	"os"
	"path/filepath"
	"sync/atomic"
)

// defaultProb is the tiny non-zero probability that a word
// we have not seen before appears in the class.
const defaultProb = 0.00000000001

// ErrUnderflow is returned when an underflow is detected.
var ErrUnderflow = errors.New("possible underflow detected")

// Class defines a class that the classifier will filter:
// C = {C_1, ..., C_n}. You should define your classes as a
// set of constants, for example as follows:
//
//	const (
//		Good Class = "Good"
//		Bad  Class = "Bad"
//	)
//
// Class values should be unique.
type Class string

// Classifier implements the Naive Bayesian Classifier.
type Classifier struct {
	Classes []Class
	learned int   // docs learned
	seen    int32 // docs seen
	datas   map[Class]*classData
	tfIdf   bool

	// DidConvertTfIdf tracks whether ConvertTermsFreqToTfIdf has been
	// called; a TF-IDF classifier cannot classify until it has.
	DidConvertTfIdf bool
}

// serializableClassifier represents a container for
// Classifier objects whose fields are modifiable by
// reflection and are therefore writeable by gob.
type serializableClassifier struct {
	Classes         []Class
	Learned         int
	Seen            int
	Datas           map[Class]*classData
	TfIdf           bool
	DidConvertTfIdf bool
}

// classData holds the frequency data for words in a
// particular class. In the future, we may replace this
// structure with a trie-like structure for more
// efficient storage.
type classData struct {
	Freqs   map[string]float64
	FreqTfs map[string][]float64
	Total   int
}

// newClassData creates a new empty classData node.
func newClassData() *classData {
	return &classData{
		Freqs:   make(map[string]float64),
		FreqTfs: make(map[string][]float64),
	}
}

// getWordProb returns P(W|C_j) -- the probability of seeing
// a particular word W in a document of this class.
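// For example, a word counted 3 times in a class whose Total is 60
// yields 3/60 = 0.05; a word never seen in the class falls back to
// defaultProb. (The numbers here are illustrative only.)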
func (d *classData) getWordProb(word string) float64 {
	value, ok := d.Freqs[word]
	if !ok {
		return defaultProb
	}
	return value / float64(d.Total)
}

// getWordsProb returns P(D|C_j) -- the probability of seeing
// this set of words in a document of this class.
//
// Note that words should not be empty, and this method of
// calculation is prone to underflow if there are many words
// and their individual probabilities are small.
func (d *classData) getWordsProb(words []string) (prob float64) {
	prob = 1
	for _, word := range words {
		prob *= d.getWordProb(word)
	}
	return
}

// NewClassifierTfIdf returns a new TF-IDF classifier. The classes
// provided should be at least 2 in number and unique, or this method
// will panic.
func NewClassifierTfIdf(classes ...Class) (c *Classifier) {
	n := len(classes)
	// check size
	if n < 2 {
		panic("provide at least two classes")
	}
	// check uniqueness
	check := make(map[Class]bool, n)
	for _, class := range classes {
		check[class] = true
	}
	if len(check) != n {
		panic("classes must be unique")
	}
	// create the classifier
	c = &Classifier{
		Classes: classes,
		datas:   make(map[Class]*classData, n),
		tfIdf:   true,
	}
	for _, class := range classes {
		c.datas[class] = newClassData()
	}
	return
}
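
// A minimal TF-IDF workflow sketch (Good and Bad are hypothetical
// example classes, not defined in this file); the conversion step must
// come after all learning and before any classification:
//
//	c := NewClassifierTfIdf(Good, Bad)
//	c.Learn([]string{"tall", "handsome", "rich"}, Good)
//	c.Learn([]string{"poor", "smelly", "ugly"}, Bad)
//	c.ConvertTermsFreqToTfIdf()
//	scores, likely, _ := c.LogScores([]string{"tall", "smelly"})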

// NewClassifier returns a new classifier. The classes provided
// should be at least 2 in number and unique, or this method will
// panic.
func NewClassifier(classes ...Class) (c *Classifier) {
	n := len(classes)
	// check size
	if n < 2 {
		panic("provide at least two classes")
	}
	// check uniqueness
	check := make(map[Class]bool, n)
	for _, class := range classes {
		check[class] = true
	}
	if len(check) != n {
		panic("classes must be unique")
	}
	// create the classifier
	c = &Classifier{
		Classes:         classes,
		datas:           make(map[Class]*classData, n),
		tfIdf:           false,
		DidConvertTfIdf: false,
	}
	for _, class := range classes {
		c.datas[class] = newClassData()
	}
	return
}
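
// A minimal frequency-based usage sketch, assuming two hypothetical
// classes declared as constants:
//
//	const (
//		Good Class = "Good"
//		Bad  Class = "Bad"
//	)
//
//	c := NewClassifier(Good, Bad)
//	c.Learn([]string{"tall", "handsome", "rich"}, Good)
//	c.Learn([]string{"poor", "smelly", "ugly"}, Bad)
//	scores, likely, _ := c.LogScores([]string{"tall", "girl"})
//	// c.Classes[likely] is the most probable class for the document.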

// NewClassifierFromFile loads an existing classifier from
// file. The classifier was previously saved with a call
// to c.WriteToFile(string).
func NewClassifierFromFile(name string) (c *Classifier, err error) {
	file, err := os.Open(name)
	if err != nil {
		return nil, err
	}
	defer file.Close()
	return NewClassifierFromReader(file)
}
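
// A save/load round-trip sketch (the file name is illustrative):
//
//	if err := c.WriteToFile("classifier.gob"); err != nil {
//		// handle the write error
//	}
//	c2, err := NewClassifierFromFile("classifier.gob")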

// NewClassifierFromReader reads a gob-encoded classifier from r and
// deserializes it.
func NewClassifierFromReader(r io.Reader) (c *Classifier, err error) {
	dec := gob.NewDecoder(r)
	w := new(serializableClassifier)
	if err = dec.Decode(w); err != nil {
		return nil, err
	}
	return &Classifier{w.Classes, w.Learned, int32(w.Seen), w.Datas, w.TfIdf, w.DidConvertTfIdf}, nil
}

// getPriors returns the prior probabilities for the
// classes provided -- P(C_j).
//
// TODO: There is a way to smooth priors, currently
// not implemented here.
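// For example, if one class has accumulated 30 words in total and the
// other 10, the returned priors are [0.75, 0.25].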
func (c *Classifier) getPriors() (priors []float64) {
	n := len(c.Classes)
	priors = make([]float64, n)
	sum := 0
	for index, class := range c.Classes {
		total := c.datas[class].Total
		priors[index] = float64(total)
		sum += total
	}
	if sum != 0 {
		for i := 0; i < n; i++ {
			priors[i] /= float64(sum)
		}
	}
	return
}

// Learned returns the number of documents ever learned
// in the lifetime of this classifier.
func (c *Classifier) Learned() int {
	return c.learned
}

// Seen returns the number of documents ever classified
// in the lifetime of this classifier.
func (c *Classifier) Seen() int {
	return int(atomic.LoadInt32(&c.seen))
}

// IsTfIdf returns true if this is a TF-IDF classifier.
func (c *Classifier) IsTfIdf() bool {
	return c.tfIdf
}

// WordCount returns the number of words counted for
// each class in the lifetime of the classifier.
func (c *Classifier) WordCount() (result []int) {
	result = make([]int, len(c.Classes))
	for inx, class := range c.Classes {
		data := c.datas[class]
		result[inx] = data.Total
	}
	return
}

// Observe should be used when word frequencies have already been
// learned externally (e.g., in a Hadoop job).
func (c *Classifier) Observe(word string, count int, which Class) {
	data := c.datas[which]
	data.Freqs[word] += float64(count)
	data.Total += count
}
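
// For example, if an external job counted the word "free" 42 times
// across documents of a hypothetical class Spam, the counts can be
// folded in directly:
//
//	c.Observe("free", 42, Spam)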

// Learn will accept new training documents for
// supervised learning.
func (c *Classifier) Learn(document []string, which Class) {
	// If we are a TF-IDF classifier we first need to store the terms'
	// frequencies, so we can work out the IDF part later in
	// ConvertTermsFreqToTfIdf().
	if c.tfIdf {
		if c.DidConvertTfIdf {
			panic("Cannot call Learn after ConvertTermsFreqToTfIdf. Reset and relearn to reconvert.")
		}
		// Term Frequency: word count in document / document length
		docTf := make(map[string]float64)
		for _, word := range document {
			docTf[word]++
		}
		docLen := float64(len(document))
		for wIndex, wCount := range docTf {
			docTf[wIndex] = wCount / docLen
			// add the TF sample; after training we can get IDF values.
			c.datas[which].FreqTfs[wIndex] = append(c.datas[which].FreqTfs[wIndex], docTf[wIndex])
		}
	}
	data := c.datas[which]
	for _, word := range document {
		data.Freqs[word]++
		data.Total++
	}
	c.learned++
}

// ConvertTermsFreqToTfIdf uses all the TF samples for the class and
// converts them to TF-IDF (https://en.wikipedia.org/wiki/Tf%E2%80%93idf)
// once we have finished learning all the classes and have the totals.
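// Each stored TF sample tf is transformed to
// log1p(tf) * log1p(learned/classTotal), and each word's frequency
// entry becomes the sum of its transformed samples.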
func (c *Classifier) ConvertTermsFreqToTfIdf() {
	if c.DidConvertTfIdf {
		panic("Cannot call ConvertTermsFreqToTfIdf more than once. Reset and relearn to reconvert.")
	}
	for className := range c.datas {
		for wIndex := range c.datas[className].FreqTfs {
			tfIdfAdder := float64(0)
			for tfSampleIndex := range c.datas[className].FreqTfs[wIndex] {
				// Log1p keeps the TF-IDF score positive.
				tf := c.datas[className].FreqTfs[wIndex][tfSampleIndex]
				c.datas[className].FreqTfs[wIndex][tfSampleIndex] = math.Log1p(tf) * math.Log1p(float64(c.learned)/float64(c.datas[className].Total))
				tfIdfAdder += c.datas[className].FreqTfs[wIndex][tfSampleIndex]
			}
			// convert the 'counts' to TF-IDFs
			c.datas[className].Freqs[wIndex] = tfIdfAdder
		}
	}
	// mark the conversion as done so it cannot run twice
	c.DidConvertTfIdf = true
}

// LogScores produces "log-likelihood"-like scores that can
// be used to classify documents into classes.
//
// The value of the score is proportional to the likelihood,
// as determined by the classifier, that the given document
// belongs to the given class. This is true even when scores
// returned are negative, which they will be (since we are
// taking logs of probabilities).
//
// The index j of the score corresponds to the class given
// by c.Classes[j].
//
// Additionally returned are "inx" and "strict" values. The
// inx corresponds to the maximum score in the array. If more
// than one of the scores holds the maximum value, then
// strict is false.
//
// Unlike c.ProbScores(), this function is not prone to
// floating point underflow and is relatively safe to use.
func (c *Classifier) LogScores(document []string) (scores []float64, inx int, strict bool) {
	if c.tfIdf && !c.DidConvertTfIdf {
		panic("Using a TF-IDF classifier. Please call ConvertTermsFreqToTfIdf before calling LogScores.")
	}
	n := len(c.Classes)
	scores = make([]float64, n)
	priors := c.getPriors()
	// calculate the score for each class
	for index, class := range c.Classes {
		data := c.datas[class]
		// the score is the sum of the log-probabilities,
		// as outlined in the refresher
		score := math.Log(priors[index])
		for _, word := range document {
			score += math.Log(data.getWordProb(word))
		}
		scores[index] = score
	}
	inx, strict = findMax(scores)
	atomic.AddInt32(&c.seen, 1)
	return scores, inx, strict
}
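
// A classification sketch (the tokens are illustrative):
//
//	scores, inx, strict := c.LogScores([]string{"tall", "girl"})
//	winner := c.Classes[inx] // highest-scoring class; a tie makes strict false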

// ProbScores works the same as LogScores, but delivers
// actual probabilities as discussed above. Note that float64
// underflow is possible if the word list contains too
// many words that have probabilities very close to 0.
//
// Notes on underflow: underflow is going to occur when you're
// trying to assess large numbers of words that you have
// never seen before. Depending on the application, this
// may or may not be a concern. Consider using SafeProbScores()
// instead.
func (c *Classifier) ProbScores(doc []string) (scores []float64, inx int, strict bool) {
	if c.tfIdf && !c.DidConvertTfIdf {
		panic("Using a TF-IDF classifier. Please call ConvertTermsFreqToTfIdf before calling ProbScores.")
	}
	n := len(c.Classes)
	scores = make([]float64, n)
	priors := c.getPriors()
	sum := float64(0)
	// calculate the score for each class
	for index, class := range c.Classes {
		data := c.datas[class]
		// the score is the product of the word probabilities,
		// as outlined in the refresher
		score := priors[index]
		for _, word := range doc {
			score *= data.getWordProb(word)
		}
		scores[index] = score
		sum += score
	}
	for i := 0; i < n; i++ {
		scores[i] /= sum
	}
	inx, strict = findMax(scores)
	atomic.AddInt32(&c.seen, 1)
	return scores, inx, strict
}

// SafeProbScores works the same as ProbScores, but is
// able to detect underflow in those cases where underflow
// results in the reverse classification. If an underflow is detected,
// this method returns an ErrUnderflow, allowing the user to deal with it as
// necessary. Note that underflow, under certain rare circumstances,
// may still result in incorrect probabilities being returned,
// but this method guarantees that all error-less invocations
// are properly classified.
//
// Underflow detection is more costly because it also
// has to make additional log score calculations.
func (c *Classifier) SafeProbScores(doc []string) (scores []float64, inx int, strict bool, err error) {
	if c.tfIdf && !c.DidConvertTfIdf {
		panic("Using a TF-IDF classifier. Please call ConvertTermsFreqToTfIdf before calling SafeProbScores.")
	}
	n := len(c.Classes)
	scores = make([]float64, n)
	logScores := make([]float64, n)
	priors := c.getPriors()
	sum := float64(0)
	// calculate the score for each class, tracking the
	// log scores alongside the raw products
	for index, class := range c.Classes {
		data := c.datas[class]
		score := priors[index]
		logScore := math.Log(priors[index])
		for _, word := range doc {
			p := data.getWordProb(word)
			score *= p
			logScore += math.Log(p)
		}
		scores[index] = score
		logScores[index] = logScore
		sum += score
	}
	for i := 0; i < n; i++ {
		scores[i] /= sum
	}
	inx, strict = findMax(scores)
	logInx, logStrict := findMax(logScores)
	// detect underflow -- the size
	// relation between scores and logScores
	// must be preserved or something is wrong
	if inx != logInx || strict != logStrict {
		err = ErrUnderflow
	}
	atomic.AddInt32(&c.seen, 1)
	return scores, inx, strict, err
}
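
// An underflow-aware sketch: fall back to LogScores, which cannot
// underflow, when SafeProbScores reports ErrUnderflow:
//
//	scores, inx, strict, err := c.SafeProbScores(doc)
//	if err == ErrUnderflow {
//		_, inx, strict = c.LogScores(doc)
//	}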

// WordFrequencies returns a matrix of word frequencies that currently
// exist in the classifier for each class state for the given input
// words. In other words, if you obtain the frequencies
//
//	freqs := c.WordFrequencies(/* [j]string */)
//
// then the expression freqs[i][j] represents the frequency of the j-th
// word within the i-th class.
func (c *Classifier) WordFrequencies(words []string) (freqMatrix [][]float64) {
	n, l := len(c.Classes), len(words)
	freqMatrix = make([][]float64, n)
	for i := range freqMatrix {
		arr := make([]float64, l)
		data := c.datas[c.Classes[i]]
		for j := range arr {
			arr[j] = data.getWordProb(words[j])
		}
		freqMatrix[i] = arr
	}
	return
}

// WordsByClass returns a map of words and their probability of
// appearing in the given class.
func (c *Classifier) WordsByClass(class Class) (freqMap map[string]float64) {
	freqMap = make(map[string]float64)
	for word, cnt := range c.datas[class].Freqs {
		freqMap[word] = cnt / float64(c.datas[class].Total)
	}
	return freqMap
}

// WriteToFile serializes this classifier to a file.
func (c *Classifier) WriteToFile(name string) (err error) {
	// O_TRUNC discards any previous contents, so rewriting a shorter
	// classifier does not leave stale trailing bytes.
	file, err := os.OpenFile(name, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
	if err != nil {
		return err
	}
	defer file.Close()
	return c.WriteTo(file)
}

// WriteClassesToFile writes all classes to files.
func (c *Classifier) WriteClassesToFile(rootPath string) (err error) {
	for name := range c.datas {
		// propagate the first write error instead of discarding it
		if err = c.WriteClassToFile(name, rootPath); err != nil {
			return
		}
	}
	return
}

// WriteClassToFile writes a single class to file.
func (c *Classifier) WriteClassToFile(name Class, rootPath string) (err error) {
	data := c.datas[name]
	fileName := filepath.Join(rootPath, string(name))
	file, err := os.OpenFile(fileName, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
	if err != nil {
		return err
	}
	defer file.Close()
	enc := gob.NewEncoder(file)
	err = enc.Encode(data)
	return
}

// WriteTo gob-serializes this classifier and writes it to w.
func (c *Classifier) WriteTo(w io.Writer) (err error) {
	enc := gob.NewEncoder(w)
	err = enc.Encode(&serializableClassifier{c.Classes, c.learned, int(atomic.LoadInt32(&c.seen)), c.datas, c.tfIdf, c.DidConvertTfIdf})
	return
}

// ReadClassFromFile loads existing class data from a
// file.
func (c *Classifier) ReadClassFromFile(class Class, location string) (err error) {
	fileName := filepath.Join(location, string(class))
	file, err := os.Open(fileName)
	if err != nil {
		return err
	}
	defer file.Close()
	dec := gob.NewDecoder(file)
	w := new(classData)
	if err = dec.Decode(w); err != nil {
		return err
	}
	c.learned++
	c.datas[class] = w
	return
}
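
// A per-class persistence sketch (the directory path and the Good/Bad
// classes are illustrative):
//
//	c.WriteClassesToFile("/tmp/model")
//	// ... later, in a fresh process ...
//	c2 := NewClassifier(Good, Bad)
//	c2.ReadClassFromFile(Good, "/tmp/model")
//	c2.ReadClassFromFile(Bad, "/tmp/model")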

// findMax finds the maximum of a set of scores; if the
// maximum is strict -- that is, it is the single unique
// maximum from the set -- then strict has return value
// true. Otherwise it is false.
func findMax(scores []float64) (inx int, strict bool) {
	inx = 0
	strict = true
	for i := 1; i < len(scores); i++ {
		if scores[inx] < scores[i] {
			inx = i
			strict = true
		} else if scores[inx] == scores[i] {
			strict = false
		}
	}
	return
}