v2.0.11

分支 (286)

标签 (523)

管理

管理

master

release-8.1

bot/update-owners-1746611352739

release-8.5

feature/release-8.1-gsort-test

release-6.5

release-7.1

release-7.5

release-7.5-20250423-v7.5.6

release-6.5-20241009-v6.5.10

release-7.5-20250219-v7.5.5

feature/fts

release-8.5-20250401-v8.5.1

release-7.1-20250327-v7.1.6

release-6.5-20241112-v6.5.11

dedicated

release-9.0-beta.1

release-8.5-20250310-v8.5.1-1

release-7.5-20250306-v7.5.5-1

release-8.1-20250226-v8.1.2

v7.5.6-20250423-3f36b27

v6.5.10-20250416-ac6550b

v7.5.5-20250415-6ffea7d

v8.5.1-20250402-9b92a06

v7.1.6-20250327-c0f3a11

v6.5.11-20250327-ef40301

v9.0.0-beta.1

v7.5.6

v8.5.1-20250311-a5d0e02

v7.5.5-20250306-ebd2868

v8.1.2-20250228-89762e2

v6.5.12

v8.1.2-20250226-2a5fc3e

v6.5.11-20250220-ae254ca

v7.5.5-20250219-d8d4e74

v7.5.5-20250301-d8d4e74

v6.5.11-20250123-7683911

v6.1.7-20250123-4446e81

v8.5.0-20250121-04a27be

v8.5.0-20250117-f069f0b

tidb
/
statistics
/
selectivity.go

// Copyright 2017 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package statistics

import (
	"math"

	"github.com/juju/errors"
	"github.com/pingcap/tidb/ast"
	"github.com/pingcap/tidb/expression"
	"github.com/pingcap/tidb/mysql"
	"github.com/pingcap/tidb/sessionctx"
	"github.com/pingcap/tidb/util/ranger"
)

// If one condition can't be calculated, we will assume that the selectivity of this condition is 0.8.
const selectionFactor = 0.8

// exprSet is used for calculating selectivity.
type exprSet struct {
	tp int
	ID int64
	// mask is a bit pattern whose ith bit will indicate whether the ith expression is covered by this index/column.
	mask int64
	// ranges contains all the ranges we got.
	ranges []*ranger.NewRange
}

// The type of the exprSet.
const (
	indexType = iota
	pkType
	colType
)

const unknownColumnID = math.MinInt64

// getConstantColumnID receives two expressions and if one of them is column and another is constant, it returns the
// ID of the column.
func getConstantColumnID(e []expression.Expression) int64 {
	if len(e) != 2 {
		return unknownColumnID
	}
	col, ok1 := e[0].(*expression.Column)
	_, ok2 := e[1].(*expression.Constant)
	if ok1 && ok2 {
		return col.ID
	}
	col, ok1 = e[1].(*expression.Column)
	_, ok2 = e[0].(*expression.Constant)
	if ok1 && ok2 {
		return col.ID
	}
	return unknownColumnID
}

func pseudoSelectivity(t *Table, exprs []expression.Expression) float64 {
	minFactor := selectionFactor
	uniqueCol := make(map[string]bool)
	for _, expr := range exprs {
		fun, ok := expr.(*expression.ScalarFunction)
		if !ok {
			continue
		}
		colID := getConstantColumnID(fun.GetArgs())
		if colID == unknownColumnID {
			continue
		}
		switch fun.FuncName.L {
		case ast.EQ, ast.NullEQ, ast.In:
			minFactor = math.Min(minFactor, 1.0/pseudoEqualRate)
			col, ok := t.Columns[colID]
			if !ok {
				continue
			}
			if mysql.HasUniKeyFlag(col.Info.Flag) {
				uniqueCol[col.Info.Name.L] = true
			}
			if mysql.HasPriKeyFlag(col.Info.Flag) {
				if t.PKIsHandle {
					return 1.0 / float64(t.Count)
				}
				uniqueCol[col.Info.Name.L] = true
			}
		case ast.GE, ast.GT, ast.LE, ast.LT:
			minFactor = math.Min(minFactor, 1.0/pseudoLessRate)
			// FIXME: To resolve the between case.
		}
	}
	if len(uniqueCol) == 0 {
		return minFactor
	}
	// use the unique key info
	for _, idx := range t.Indices {
		if !idx.Info.Unique {
			continue
		}
		unique := true
		for _, col := range idx.Info.Columns {
			if !uniqueCol[col.Name.L] {
				unique = false
				break
			}
		}
		if unique {
			return 1.0 / float64(t.Count)
		}
	}
	return minFactor
}

// Selectivity is a function calculate the selectivity of the expressions.
// The definition of selectivity is (row count after filter / row count before filter).
// And exprs must be CNF now, in other words, `exprs[0] and exprs[1] and ... and exprs[len - 1]` should be held when you call this.
// TODO: support expressions that the top layer is a DNF.
// Currently the time complexity is o(n^2).
func (t *Table) Selectivity(ctx sessionctx.Context, exprs []expression.Expression) (float64, error) {
	// If table's count is zero or conditions are empty, we should return 100% selectivity.
	if t.Count == 0 || len(exprs) == 0 {
		return 1, nil
	}
	// TODO: If len(exprs) is bigger than 63, we could use bitset structure to replace the int64.
	// This will simplify some code and speed up if we use this rather than a boolean slice.
	if t.Pseudo || len(exprs) > 63 || (len(t.Columns) == 0 && len(t.Indices) == 0) {
		return pseudoSelectivity(t, exprs), nil
	}
	var sets []*exprSet
	sc := ctx.GetSessionVars().StmtCtx

	extractedCols := make([]*expression.Column, 0, len(t.Columns))
	extractedCols = expression.ExtractColumnsFromExpressions(extractedCols, exprs, nil)
	for _, colInfo := range t.Columns {
		col := expression.ColInfo2Col(extractedCols, colInfo.Info)
		// This column should have histogram.
		if col != nil && !t.ColumnIsInvalid(ctx.GetSessionVars().StmtCtx, col.ID) {
			maskCovered, ranges, err := getMaskAndRanges(ctx, exprs, ranger.ColumnRangeType, nil, col)
			if err != nil {
				return 0, errors.Trace(err)
			}
			sets = append(sets, &exprSet{tp: colType, ID: col.ID, mask: maskCovered, ranges: ranges})
			if colInfo.isHandle {
				sets[len(sets)-1].tp = pkType
			}
		}
	}
	for _, idxInfo := range t.Indices {
		idxCols, lengths := expression.IndexInfo2Cols(extractedCols, idxInfo.Info)
		// This index should have histogram.
		if len(idxCols) > 0 && idxInfo.Histogram.Len() > 0 {
			maskCovered, ranges, err := getMaskAndRanges(ctx, exprs, ranger.IndexRangeType, lengths, idxCols...)
			if err != nil {
				return 0, errors.Trace(err)
			}
			sets = append(sets, &exprSet{tp: indexType, ID: idxInfo.ID, mask: maskCovered, ranges: ranges})
		}
	}
	sets = getUsableSetsByGreedy(sets)
	ret := 1.0
	// Initialize the mask with the full set.
	mask := (int64(1) << uint(len(exprs))) - 1
	for _, set := range sets {
		mask ^= set.mask
		var (
			rowCount float64
			err      error
		)
		switch set.tp {
		case pkType, colType:
			rowCount, err = t.GetRowCountByColumnRanges(sc, set.ID, set.ranges)
		case indexType:
			rowCount, err = t.GetRowCountByIndexRanges(sc, set.ID, set.ranges)
		}
		if err != nil {
			return 0, errors.Trace(err)
		}
		ret *= rowCount / float64(t.Count)
	}
	// If there's still conditions which cannot be calculated, we will multiply a selectionFactor.
	if mask > 0 {
		ret *= selectionFactor
	}
	return ret, nil
}

func getMaskAndRanges(ctx sessionctx.Context, exprs []expression.Expression, rangeType ranger.RangeType,
	lengths []int, cols ...*expression.Column) (mask int64, ranges []*ranger.NewRange, err error) {
	sc := ctx.GetSessionVars().StmtCtx
	var accessConds []expression.Expression
	switch rangeType {
	case ranger.ColumnRangeType:
		accessConds = ranger.ExtractAccessConditionsForColumn(exprs, cols[0].ColName)
		ranges, err = ranger.BuildColumnRange(accessConds, sc, cols[0].RetType)
	case ranger.IndexRangeType:
		ranges, accessConds, err = ranger.DetachSimpleCondAndBuildRangeForIndex(ctx, exprs, cols, lengths)
	default:
		panic("should never be here")
	}
	if err != nil {
		return 0, nil, errors.Trace(err)
	}
	for i := range exprs {
		for j := range accessConds {
			if exprs[i].Equal(ctx, accessConds[j]) {
				mask |= 1 << uint64(i)
				break
			}
		}
	}
	return mask, ranges, nil
}

// getUsableSetsByGreedy will select the indices and pk used for calculate selectivity by greedy algorithm.
func getUsableSetsByGreedy(sets []*exprSet) (newBlocks []*exprSet) {
	mask := int64(math.MaxInt64)
	for {
		// Choose the index that covers most.
		bestID, bestCount, bestTp := -1, 0, colType
		for i, set := range sets {
			set.mask &= mask
			bits := popCount(set.mask)
			// This set cannot cover any thing, just skip it.
			if bits == 0 {
				continue
			}
			if (bestTp == colType && set.tp < colType) || bestCount < bits {
				bestID, bestCount, bestTp = i, bits, set.tp
			}
		}
		if bestCount == 0 {
			break
		} else {
			// update the mask, remove the bit that sets[bestID].mask has.
			mask &^= sets[bestID].mask

			newBlocks = append(newBlocks, sets[bestID])
			// remove the chosen one
			sets = append(sets[:bestID], sets[bestID+1:]...)
		}
	}
	return
}

// popCount is the digit sum of the binary representation of the number x.
func popCount(x int64) int {
	ret := 0
	// x -= x & -x, remove the lowest bit of the x.
	// e.g. result will be 2 if x is 3.
	for ; x > 0; x -= x & -x {
		ret++
	}
	return ret
}