diff --git a/README.md b/README.md
index abaeb8bc80cf336a77c28dfba987af2d755c06d2..3513ddf09a195ce2ece23feda8f60f1d99aeee54 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,42 @@
-# pandas
+pandas
+===
+[![Sourcegraph](https://sourcegraph.com/github.com/quant1x/pandas/-/badge.svg)](https://sourcegraph.com/github.com/quant1x/pandas?badge)
+[![Build Status](https://api.travis-ci.com/repos/quant1x/pandas.png)](https://travis-ci.com/quant1x/pandas)
+[![codecov](https://codecov.io/gh/quant1x/pandas/branch/master/graph/badge.svg)](https://codecov.io/gh/quant1x/pandas)
+![Golang 1.20+](https://img.shields.io/badge/Golang-1.20+-orange.svg?style=flat)
+![tag](https://img.shields.io/github/tag/quant1x/pandas.svg?style=flat)
+![license](https://img.shields.io/github/license/quant1x/pandas.svg)
 
 ## 1. 介绍
-python pandas库的golang版本的整合
-## 2. 整合的代码库有:
-- https://github.com/go-gota/gota [gota介绍](gota/README-gota.md)
+golang版本的pandas
+
+## 2. 功能/模块划分
+
+### 2.1 特性列表
+| 模块 | 一级功能 | 二级功能 | 进展情况 |
+|:----------|:--------------|:----------|:-----------------------------------|
+| dataframe | dataframe | new | [√] |
+| dataframe | 类型约束 | string | [√] |
+| dataframe | 类型约束 | bool | [√] |
+| dataframe | 类型约束 | int64 | [√] |
+| dataframe | 类型约束 | float64 | [√] |
+| dataframe | 泛型类型 | 支持全部的基础类型 | [√] |
+| dataframe | 泛型类型 | 自动检测类型 | [√]优先级:string > bool > float > int |
+| series | series | new | [√] series的列元素类型和reflect.Kind保持一致 |
+| series | 伪泛型 | 构建 | [√] 在新建series完成之后类型就确定了 |
+| series | SeriesBool | bool类型 | [√] |
+| series | SeriesString | string类型 | [√] |
+| series | SeriesInt64 | int64类型 | [√] |
+| series | SeriesFloat64 | float64类型 | [√] |
+
+
+
+## 3. 示例
+
+### 3.1. dataframe
+### 3.2. series
+
+## 4. 参考的代码:
+- https://github.com/go-gota/gota
 - https://github.com/WinPooh32/series
 - https://github.com/rocketlaunchr/dataframe-go.git
-## 3. 
整合的目的
-- gota 缺少 ExponentialMovingWindow的用法, 从而缺失EWM的后续操作方法
-- WinPooh32的series功能比较全, 缺少DataFrame的衔接
-- 整合最终的目标是对标公式指标类似myTT的实现方法
diff --git a/algorithms/avx2/bool.go b/algorithms/avx2/bool.go
new file mode 100644
index 0000000000000000000000000000000000000000..c603649889d73a43a9ef89e1617a6149ed420869
--- /dev/null
+++ b/algorithms/avx2/bool.go
@@ -0,0 +1,8 @@
+package avx2
+
+import "github.com/viterin/vek"
+
+// ToBool converts x to a []bool by delegating to vek.ToBool.
+func ToBool(x []float64) []bool {
+	return vek.ToBool(x)
+}
diff --git a/algorithms/utils.go b/algorithms/utils.go
new file mode 100644
index 0000000000000000000000000000000000000000..a7454008c166f8eae1e638f7fa143854fb9b0c50
--- /dev/null
+++ b/algorithms/utils.go
@@ -0,0 +1,28 @@
+package algorithms
+
+import "math"
+
+// WantFloat reports whether got and want DIFFER: it returns true when the two
+// values are unequal and are not both NaN. Note that this polarity is the
+// opposite of SliceWantFloat, which returns true on a match.
+func WantFloat(got, want float64) bool {
+	return got != want && !(math.IsNaN(want) && math.IsNaN(got))
+}
+
+// SliceWantFloat reports whether got and want have the same length and are
+// equal element by element, treating two NaN values at the same index as
+// equal.
+func SliceWantFloat(got, want []float64) bool {
+	// A length mismatch can never be a match; checking it first also keeps
+	// want[i] in range for every index of got.
+	if len(got) != len(want) {
+		return false
+	}
+	for i, g := range got {
+		w := want[i]
+		if g != w && !(math.IsNaN(g) && math.IsNaN(w)) {
+			return false
+		}
+	}
+	return true
+}
diff --git a/dataframe.go b/dataframe.go
index 218ddc667c6f1814d1fe7964bfbdcbf7814b7bb8..1f0505cb778a4172acec8b5bd76eef5f97926923 100644
--- a/dataframe.go
+++ b/dataframe.go
@@ -3,9 +3,6 @@ package pandas
 import (
 	"fmt"
 	"sort"
-	"strconv"
-	"strings"
-	"unicode/utf8"
 )
 
 // DataFrame 以gota的DataFrame的方法为主, 兼顾新流程, 避免单元格元素结构化
@@ -72,162 +69,12 @@ func (df DataFrame) Ncol() int {
 	return df.ncols
 }
 
-// String implements the Stringer interface for DataFrame
-func (df DataFrame) String() (str string) {
-	return df.print(true, true, true, true, 10, 70, "DataFrame")
-}
-
 // Returns error or nil if no error occured
 func (df *DataFrame) Error() error {
 	return df.Err
 }
 
-func (df DataFrame) print(
-	shortRows, shortCols, showDims, showTypes bool,
-	maxRows int,
-	maxCharsTotal int,
-	class string) (str string) {
-
-	addRightPadding := func(s string, nchar int) string {
-		if utf8.RuneCountInString(s) < nchar {
-			return s + 
strings.Repeat(" ", nchar-utf8.RuneCountInString(s)) - } - return s - } - - addLeftPadding := func(s string, nchar int) string { - if utf8.RuneCountInString(s) < nchar { - return strings.Repeat(" ", nchar-utf8.RuneCountInString(s)) + s - } - return s - } - - if df.Err != nil { - str = fmt.Sprintf("%s error: %v", class, df.Err) - return - } - nrows, ncols := df.Dims() - if nrows == 0 || ncols == 0 { - str = fmt.Sprintf("Empty %s", class) - return - } - //idx := make([]int, maxRows) - //for i := 0; i < len(idx); i++ { - // idx[i] = i - //} - var records [][]string - shortening := false - if shortRows && nrows > maxRows { - shortening = true - df = df.Subset(0, maxRows) - records = df.Records() - } else { - records = df.Records() - } - - if showDims { - str += fmt.Sprintf("[%dx%d] %s\n\n", nrows, ncols, class) - } - - // Add the row numbers - for i := 0; i < df.nrows+1; i++ { - add := "" - if i != 0 { - add = strconv.Itoa(i-1) + ":" - } - records[i] = append([]string{add}, records[i]...) - } - if shortening { - dots := make([]string, ncols+1) - for i := 1; i < ncols+1; i++ { - dots[i] = "..." - } - records = append(records, dots) - } - types := df.Types() - typesrow := make([]string, ncols) - for i := 0; i < ncols; i++ { - typesrow[i] = fmt.Sprintf("<%v>", types[i]) - } - typesrow = append([]string{""}, typesrow...) 
- - if showTypes { - records = append(records, typesrow) - } - - maxChars := make([]int, df.ncols+1) - for i := 0; i < len(records); i++ { - for j := 0; j < df.ncols+1; j++ { - // Escape special characters - records[i][j] = strconv.Quote(records[i][j]) - records[i][j] = records[i][j][1 : len(records[i][j])-1] - - // Detect maximum number of characters per column - if len(records[i][j]) > maxChars[j] { - maxChars[j] = utf8.RuneCountInString(records[i][j]) - } - } - } - maxCols := len(records[0]) - var notShowing []string - if shortCols { - maxCharsCum := 0 - for colnum, m := range maxChars { - maxCharsCum += m - if maxCharsCum > maxCharsTotal { - maxCols = colnum - break - } - } - notShowingNames := records[0][maxCols:] - notShowingTypes := typesrow[maxCols:] - notShowing = make([]string, len(notShowingNames)) - for i := 0; i < len(notShowingNames); i++ { - notShowing[i] = fmt.Sprintf("%s %s", notShowingNames[i], notShowingTypes[i]) - } - } - for i := 0; i < len(records); i++ { - // Add right padding to all elements - records[i][0] = addLeftPadding(records[i][0], maxChars[0]+1) - for j := 1; j < df.ncols; j++ { - records[i][j] = addRightPadding(records[i][j], maxChars[j]) - } - records[i] = records[i][0:maxCols] - if shortCols && len(notShowing) != 0 { - records[i] = append(records[i], "...") - } - // Create the final string - str += strings.Join(records[i], " ") - str += "\n" - } - if shortCols && len(notShowing) != 0 { - var notShown string - var notShownArr [][]string - cum := 0 - i := 0 - for n, ns := range notShowing { - cum += len(ns) - if cum > maxCharsTotal { - notShownArr = append(notShownArr, notShowing[i:n]) - cum = 0 - i = n - } - } - if i < len(notShowing) { - notShownArr = append(notShownArr, notShowing[i:]) - } - for k, ns := range notShownArr { - notShown += strings.Join(ns, ", ") - if k != len(notShownArr)-1 { - notShown += "," - } - notShown += "\n" - } - str += fmt.Sprintf("\nNot Showing: %s", notShown) - } - return str -} - +// 检查列的尺寸 func 
checkColumnsDimensions(se ...Series) (nrows, ncols int, err error) {
 	ncols = len(se)
 	nrows = -1
diff --git a/dataframe_xstring.go b/dataframe_xstring.go
new file mode 100644
index 0000000000000000000000000000000000000000..bbcc5e475051157659efa7bbef53784055ff6bf2
--- /dev/null
+++ b/dataframe_xstring.go
@@ -0,0 +1,172 @@
+package pandas
+
+import (
+	"fmt"
+	"strconv"
+	"strings"
+	"unicode/utf8"
+)
+
+// String implements the Stringer interface for DataFrame
+func (df DataFrame) String() (str string) {
+	return df.print(true, true, true, true, 10, 70, "DataFrame")
+}
+
+// print renders the DataFrame as a text table and returns it as a string.
+//
+// If df.Err is set, or the frame has no rows or no columns, a one-line
+// message built from class is returned instead of a table.
+//
+//   - shortRows: when the frame has more than maxRows rows, render only the
+//     result of df.Subset(0, maxRows) followed by a row of "..." cells.
+//   - shortCols: stop adding columns once their accumulated width exceeds
+//     maxCharsTotal and list the omitted columns under "Not Showing:".
+//   - showDims: prepend a "[<rows>x<cols>] <class>" header line.
+//   - showTypes: append a row holding each column's type in angle brackets.
+func (df DataFrame) print(
+	shortRows, shortCols, showDims, showTypes bool,
+	maxRows int,
+	maxCharsTotal int,
+	class string) (str string) {
+
+	addRightPadding := func(s string, nchar int) string {
+		if utf8.RuneCountInString(s) < nchar {
+			return s + strings.Repeat(" ", nchar-utf8.RuneCountInString(s))
+		}
+		return s
+	}
+
+	addLeftPadding := func(s string, nchar int) string {
+		if utf8.RuneCountInString(s) < nchar {
+			return strings.Repeat(" ", nchar-utf8.RuneCountInString(s)) + s
+		}
+		return s
+	}
+
+	if df.Err != nil {
+		str = fmt.Sprintf("%s error: %v", class, df.Err)
+		return
+	}
+	nrows, ncols := df.Dims()
+	if nrows == 0 || ncols == 0 {
+		str = fmt.Sprintf("Empty %s", class)
+		return
+	}
+	//idx := make([]int, maxRows)
+	//for i := 0; i < len(idx); i++ {
+	//	idx[i] = i
+	//}
+	var records [][]string
+	shortening := false
+	if shortRows && nrows > maxRows {
+		shortening = true
+		df = df.Subset(0, maxRows)
+		records = df.Records()
+	} else {
+		records = df.Records()
+	}
+
+	if showDims {
+		str += fmt.Sprintf("[%dx%d] %s\n\n", nrows, ncols, class)
+	}
+
+	// Add the row numbers
+	for i := 0; i < df.nrows+1; i++ {
+		add := ""
+		if i != 0 {
+			add = strconv.Itoa(i-1) + ":"
+		}
+		records[i] = append([]string{add}, records[i]...)
+	}
+	if shortening {
+		dots := make([]string, ncols+1)
+		for i := 1; i < ncols+1; i++ {
+			dots[i] = "..."
+		}
+		records = append(records, dots)
+	}
+	types := df.Types()
+	typesrow := make([]string, ncols)
+	for i := 0; i < ncols; i++ {
+		typesrow[i] = fmt.Sprintf("<%v>", types[i])
+	}
+	typesrow = append([]string{""}, typesrow...)
+
+	if showTypes {
+		records = append(records, typesrow)
+	}
+
+	maxChars := make([]int, df.ncols+1)
+	for i := 0; i < len(records); i++ {
+		for j := 0; j < df.ncols+1; j++ {
+			// Escape special characters
+			records[i][j] = strconv.Quote(records[i][j])
+			records[i][j] = records[i][j][1 : len(records[i][j])-1]
+
+			// Detect maximum number of characters per column. Compare rune
+			// counts, not byte lengths: a multi-byte cell must not shrink a
+			// width that was already recorded for a wider cell.
+			if n := utf8.RuneCountInString(records[i][j]); n > maxChars[j] {
+				maxChars[j] = n
+			}
+		}
+	}
+	maxCols := len(records[0])
+	var notShowing []string
+	if shortCols {
+		maxCharsCum := 0
+		for colnum, m := range maxChars {
+			maxCharsCum += m
+			if maxCharsCum > maxCharsTotal {
+				maxCols = colnum
+				break
+			}
+		}
+		notShowingNames := records[0][maxCols:]
+		notShowingTypes := typesrow[maxCols:]
+		notShowing = make([]string, len(notShowingNames))
+		for i := 0; i < len(notShowingNames); i++ {
+			notShowing[i] = fmt.Sprintf("%s %s", notShowingNames[i], notShowingTypes[i])
+		}
+	}
+	for i := 0; i < len(records); i++ {
+		// Add right padding to all elements
+		records[i][0] = addLeftPadding(records[i][0], maxChars[0]+1)
+		for j := 1; j < df.ncols; j++ {
+			records[i][j] = addRightPadding(records[i][j], maxChars[j])
+		}
+		records[i] = records[i][0:maxCols]
+		if shortCols && len(notShowing) != 0 {
+			records[i] = append(records[i], "...")
+		}
+		// Create the final string
+		str += strings.Join(records[i], " ")
+		str += "\n"
+	}
+	if shortCols && len(notShowing) != 0 {
+		var notShown string
+		var notShownArr [][]string
+		cum := 0
+		i := 0
+		for n, ns := range notShowing {
+			cum += len(ns)
+			if cum > maxCharsTotal {
+				notShownArr = append(notShownArr, notShowing[i:n])
+				cum = 0
+				i = n
+			}
+		}
+		if i < len(notShowing) {
+			notShownArr = append(notShownArr, notShowing[i:])
+		}
+		for k, ns := range notShownArr {
+			notShown += strings.Join(ns, ", ")
+			if k != len(notShownArr)-1 {
+				notShown += ","
+			}
+			notShown += "\n"
+		}
+		str += fmt.Sprintf("\nNot Showing: %s", notShown)
+	}
+	return str
+}
diff --git a/series.go b/series.go
index 010849296b3a05b9dc8d598e3763eecfaf84f5ef..94bd51acd1952ea870e45bb22d85bed04b3cc364 100644
--- a/series.go
+++ b/series.go
@@ -2,7 +2,6 @@ package pandas
 
 import (
 	"fmt"
-	"github.com/google/go-cmp/cmp"
 	"math"
 	"reflect"
 )
@@ -135,21 +134,6 @@ func GenericSeries[T GenericType](name string, values ...T) *Series {
 	return NewSeries(_type, name, values)
 }
 
-// DefaultIsEqualFunc is the default comparitor to determine if
-// two values in the series are the same.
-func DefaultIsEqualFunc(a, b interface{}) bool {
-	return cmp.Equal(a, b)
-}
-
-// DefaultFormatter will return a string representation
-// of the data in a particular row.
-func DefaultFormatter(v interface{}) string {
-	if v == nil {
-		return StringNaN
-	}
-	return fmt.Sprintf("%v", v)
-}
-
 func detectTypes[T GenericType](v T) (Type, any) {
 	var _type = SERIES_TYPE_STRING
 	vv := reflect.ValueOf(v)
diff --git a/series_defaults.go b/series_defaults.go
new file mode 100644
index 0000000000000000000000000000000000000000..c8a315ea6d49f9c3a6332fadedbac98727f09a8f
--- /dev/null
+++ b/series_defaults.go
@@ -0,0 +1,21 @@
+package pandas
+
+import (
+	"fmt"
+	"github.com/google/go-cmp/cmp"
+)
+
+// DefaultIsEqualFunc is the default comparator to determine if
+// two values in the series are the same.
+func DefaultIsEqualFunc(a, b interface{}) bool {
+	return cmp.Equal(a, b)
+}
+
+// DefaultFormatter will return a string representation
+// of the data in a particular row.
+func DefaultFormatter(v interface{}) string {
+	if v == nil {
+		return StringNaN
+	}
+	return fmt.Sprintf("%v", v)
+}
diff --git a/series_float64_test.go b/series_float64_test.go
index 371c9fb3cc30b1bc4679f5c8f58c27efca0a3906..2d918b5c1f0c0475cedc6ec98d019e5079ada0e6 100644
--- a/series_float64_test.go
+++ b/series_float64_test.go
@@ -2,6 +2,7 @@ package pandas
 
 import (
 	"fmt"
+	"gitee.com/quant1x/pandas/stat"
 	"testing"
 )
 
@@ -44,3 +45,23 @@ func TestNewSeriesFloat64(t *testing.T) {
 	e5 := s5.EWM(EW{Alpha: 1 / 5.0, Adjust: false}).Mean()
 	fmt.Println(e5)
 }
+
+// TestSeriesWhere checks stat.Where on equal-length float64 inputs.
+func TestSeriesWhere(t *testing.T) {
+	c1 := []float64{1, 0, 3, 4, 5, 6, 7, 8, 9, 10}
+	d1 := []float64{10, 11, 12, 13, 14, 15, 16, 17, 18, 19}
+	d2 := []float64{20, 21, 22, 23, 24, 25, 26, 27, 28, 29}
+	// Only c1[1] is zero, so only index 1 is expected to come from d2.
+	want := []float64{10, 21, 12, 13, 14, 15, 16, 17, 18, 19}
+
+	w1 := stat.Where(c1, d1, d2)
+	fmt.Println(w1)
+	if len(w1) != len(want) {
+		t.Fatalf("Where() returned %d elements, want %d", len(w1), len(want))
+	}
+	for i := range want {
+		if w1[i] != want[i] {
+			t.Errorf("Where()[%d] = %v, want %v", i, w1[i], want[i])
+		}
+	}
+}
diff --git a/stat/align.go b/stat/align.go
new file mode 100644
index 0000000000000000000000000000000000000000..b84e12690fcf988fbbfee1e3585b78ede2706591
--- /dev/null
+++ b/stat/align.go
@@ -0,0 +1,16 @@
+package stat
+
+// align returns a new slice of exactly dLen elements: the leading elements
+// are copied from x and, when x is shorter than dLen, the remaining positions
+// are filled with a. When x is longer than dLen the copy is truncated to the
+// first dLen elements. x itself is never modified. dLen must not be negative.
+func align[T StatType](x []T, a T, dLen int) []T {
+	d := make([]T, dLen)
+	// copy transfers min(len(d), len(x)) elements, which covers both the
+	// truncating and the extending case; n is where the padding starts.
+	n := copy(d, x)
+	for i := n; i < dLen; i++ {
+		d[i] = a
+	}
+	return d
+}
diff --git a/stat/gonum.go b/stat/gonum.go
new file mode 100644
index 0000000000000000000000000000000000000000..570aa5273a4cbb8bcfcdd62a27b9f21c82df63ca
--- /dev/null
+++ b/stat/gonum.go
@@ -0,0 +1,12 @@
+package stat
+
+import "gonum.org/v1/gonum/stat"
+
+// The functions in this file wrap a subset of gonum.org/v1/gonum
+// (https://github.com/gonum/gonum.git).
+
+// LinearRegression performs a linear regression by delegating to gonum's
+// stat.LinearRegression with the arguments unchanged.
+func LinearRegression(x, y, weights []float64, origin bool) (alpha, beta float64) {
+	return stat.LinearRegression(x, y, weights, origin)
+}
diff --git a/stat/params.go b/stat/params.go
new file mode 100644
index 
0000000000000000000000000000000000000000..826450bc1b6e194a61d6af8ae835cac9358d43e2
--- /dev/null
+++ b/stat/params.go
@@ -0,0 +1,21 @@
+package stat
+
+import "fmt"
+
+// detectParam classifies v as either a scalar of type T or a slice of T.
+// A T is returned as the first result and a []T as the second; the result
+// that does not apply is left at its zero value. Any other dynamic type
+// (including a nil interface) yields an error.
+func detectParam[T StatType](v any) (T, []T, error) {
+	var base T
+	var slice []T
+	switch val := v.(type) {
+	case []T:
+		slice = val
+	case T:
+		base = val
+	default:
+		return base, nil, fmt.Errorf("stat: unsupported parameter type %T", v)
+	}
+	return base, slice, nil
+}
diff --git a/stat/type.go b/stat/type.go
new file mode 100644
index 0000000000000000000000000000000000000000..12293572243b0c24ebf08d7b1227accd394dbb73
--- /dev/null
+++ b/stat/type.go
@@ -0,0 +1,29 @@
+package stat
+
+import (
+	"math"
+	"reflect"
+)
+
+// StatType is the set of numeric element types accepted by the generic
+// helpers in this package.
+type StatType interface {
+	~int32 | ~int64 | ~float32 | ~float64
+}
+
+// typeDefault returns the filler value for T: NaN for floating-point kinds and
+// 0 for integer kinds. The value of x is ignored; only its type matters, so
+// callers may pass anything of type T.
+func typeDefault[T StatType](x T) T {
+	// Kind reports the underlying kind, so named types whose underlying type
+	// is one of the StatType members are classified as well.
+	xv := reflect.ValueOf(x)
+	xk := xv.Kind()
+	switch xk {
+	case reflect.Int32, reflect.Int64:
+		return T(0)
+	case reflect.Float32, reflect.Float64:
+		return T(math.NaN())
+	}
+	return T(0)
+}
diff --git a/stat/where.go b/stat/where.go
new file mode 100644
index 0000000000000000000000000000000000000000..929cd3e3a7eb776afa974d1d9cd12c7c5dcbe9ea
--- /dev/null
+++ b/stat/where.go
@@ -0,0 +1,62 @@
+package stat
+
+// Earlier sketch of a variadic Where, kept for reference:
+//
+//func Where[T int64 | float64](condition any, params ...any) []T {
+//	c1, c2, err := detectParam[float64](condition)
+//	if err != nil {
+//		return []T{}
+//	}
+//
+//}
+
+// Where1 is the float64-only form of Where and is kept for existing callers.
+// It delegates to Where, so both functions share one implementation.
+func Where1(condition []float64, x, y []float64) []float64 {
+	return Where(condition, x, y)
+}
+
+// Where selects elements from x or y depending on condition: for every index
+// i the result holds x[i] when condition[i] != 0 and y[i] otherwise.
+//
+// The three slices may differ in length. Shorter ones are padded to the
+// longest length with the default value of T (NaN for floating-point types, 0
+// for integer types), and the result has that longest length. Because
+// NaN != 0, a NaN condition, including one produced by padding, selects from
+// x.
+//
+// This is a deliberately simple, slice-only implementation; scalar
+// conditions and operands are not supported yet.
+func Where[T StatType](condition []T, x, y []T) []T {
+	// The result is as long as the longest of the three inputs.
+	maxLength := len(condition)
+	if n := len(x); n > maxLength {
+		maxLength = n
+	}
+	if n := len(y); n > maxLength {
+		maxLength = n
+	}
+
+	// Pad the shorter inputs so that every index below maxLength is valid.
+	defaultValue := typeDefault(T(0))
+	if len(condition) < maxLength {
+		condition = align(condition, defaultValue, maxLength)
+	}
+	if len(x) < maxLength {
+		x = align(x, defaultValue, maxLength)
+	}
+	if len(y) < maxLength {
+		y = align(y, defaultValue, maxLength)
+	}
+
+	d := make([]T, maxLength)
+	for i := range d {
+		// NaN compares unequal to 0, so it counts as true.
+		if condition[i] != 0 {
+			d[i] = x[i]
+		} else {
+			d[i] = y[i]
+		}
+	}
+	return d
+}
diff --git a/stat/where_test.go b/stat/where_test.go
new file mode 100644
index 0000000000000000000000000000000000000000..275faf42a3dc52ddbbe377d63f90564d266a1722
--- /dev/null
+++ b/stat/where_test.go
@@ -0,0 +1,56 @@
+package stat
+
+import (
+	"gitee.com/quant1x/pandas/algorithms"
+	"math"
+	"testing"
+)
+
+// TestWhere covers equal-length inputs and inputs that need NaN padding.
+func TestWhere(t *testing.T) {
+	type args struct {
+		condition []float64
+		x         []float64
+		y         []float64
+	}
+	tests := []struct {
+		name string
+		args args
+		want []float64
+	}{
+		{
+			name: "t01",
+			args: args{
+				condition: []float64{1, 1, 1},
+				x:         []float64{0.1, 0.2, 0.3},
+				y:         []float64{1.1, 1.2, 1.3},
+			},
+			want: []float64{0.1, 0.2, 0.3},
+		},
+		{
+			name: "t02",
+			args: args{
+				condition: []float64{1, 0},
+				x:         []float64{0.1, 0.2, 0.3},
+				y:         []float64{1.1, 1.2, 1.3},
+			},
+			want: []float64{0.1, 1.2, 0.3},
+		},
+		{
+			name: "t03",
+			args: args{
+				condition: []float64{1, 0},
+				x:         []float64{0.1, 0.2},
+				y:         []float64{1.1, math.NaN(), 1.3},
+			},
+			want: []float64{0.1, math.NaN(), math.NaN()},
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := Where(tt.args.condition, tt.args.x, tt.args.y); !algorithms.SliceWantFloat(got, tt.want) {
+				t.Errorf("Where() = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}