diff --git a/contrib/pgvector/CHANGELOG.md b/contrib/pgvector/CHANGELOG.md new file mode 100644 index 0000000000000000000000000000000000000000..794416670440f67b59836bdd90f95ead4471eccc --- /dev/null +++ b/contrib/pgvector/CHANGELOG.md @@ -0,0 +1,132 @@ +## 0.4.4 (2023-06-12) + +- Improved error message for malformed vector literal +- Fixed segmentation fault with text input +- Fixed consecutive delimiters with text input + +## 0.4.3 (2023-06-10) + +- Improved cost estimation +- Improved support for spaces with text input +- Fixed infinite and NaN values with binary input +- Fixed infinite values with vector addition and subtraction +- Fixed infinite values with list centers +- Fixed compilation error when `float8` is pass by reference +- Fixed compilation error on PowerPC +- Fixed segmentation fault with index creation on i386 + +## 0.4.2 (2023-05-13) + +- Added notice when index created with little data +- Fixed dimensions check for some direct function calls +- Fixed installation error with Postgres 12.0-12.2 + +## 0.4.1 (2023-03-21) + +- Improved performance of cosine distance +- Fixed index scan count + +## 0.4.0 (2023-01-11) + +If upgrading with Postgres < 13, see [this note](https://github.com/pgvector/pgvector#040). + +- Changed text representation for vector elements to match `real` +- Changed storage for vector from `plain` to `extended` +- Increased max dimensions for vector from 1024 to 16000 +- Increased max dimensions for index from 1024 to 2000 +- Improved accuracy of text parsing for certain inputs +- Added `avg` aggregate for vector +- Added experimental support for Windows +- Dropped support for Postgres 10 + +## 0.3.2 (2022-11-22) + +- Fixed `invalid memory alloc request size` error + +## 0.3.1 (2022-11-02) + +If upgrading from 0.2.7 or 0.3.0, [recreate](https://github.com/pgvector/pgvector#031) all `ivfflat` indexes after upgrading to ensure all data is indexed. 
+ +- Fixed issue with inserts silently corrupting `ivfflat` indexes (introduced in 0.2.7) +- Fixed segmentation fault with index creation when lists > 6500 + +## 0.3.0 (2022-10-15) + +- Added support for Postgres 15 +- Dropped support for Postgres 9.6 + +## 0.2.7 (2022-07-31) + +- Fixed `unexpected data beyond EOF` error + +## 0.2.6 (2022-05-22) + +- Improved performance of index creation for Postgres < 12 + +## 0.2.5 (2022-02-11) + +- Reduced memory usage during index creation +- Fixed index creation exceeding `maintenance_work_mem` +- Fixed error with index creation when lists > 1600 + +## 0.2.4 (2022-02-06) + +- Added support for parallel vacuum +- Fixed issue with index not reusing space + +## 0.2.3 (2022-01-30) + +- Added indexing progress for Postgres 12+ +- Improved interrupt handling during index creation + +## 0.2.2 (2022-01-15) + +- Fixed compilation error on Mac ARM + +## 0.2.1 (2022-01-02) + +- Fixed `operator is not unique` error + +## 0.2.0 (2021-10-03) + +- Added support for Postgres 14 + +## 0.1.8 (2021-09-07) + +- Added cast for `vector` to `real[]` + +## 0.1.7 (2021-06-13) + +- Added cast for `numeric[]` to `vector` + +## 0.1.6 (2021-06-09) + +- Fixed segmentation fault with `COUNT` + +## 0.1.5 (2021-05-25) + +- Reduced memory usage during index creation + +## 0.1.4 (2021-05-09) + +- Fixed kmeans for inner product +- Fixed multiple definition error with GCC 10 + +## 0.1.3 (2021-05-06) + +- Added Dockerfile +- Fixed version + +## 0.1.2 (2021-04-26) + +- Vectorized distance calculations +- Improved cost estimation + +## 0.1.1 (2021-04-25) + +- Added binary representation for `COPY` +- Marked functions as `PARALLEL SAFE` + +## 0.1.0 (2021-04-20) + +- First release diff --git a/contrib/pgvector/Dockerfile b/contrib/pgvector/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..6fe309954fa49fd8d26c61ba937681c743efbd48 --- /dev/null +++ b/contrib/pgvector/Dockerfile @@ -0,0 +1,18 @@ +ARG PG_MAJOR=15 +FROM postgres:$PG_MAJOR +ARG PG_MAJOR + +COPY . /tmp/pgvector + +RUN apt-get update && \ + apt-get install -y --no-install-recommends build-essential postgresql-server-dev-$PG_MAJOR && \ + cd /tmp/pgvector && \ + make clean && \ + make OPTFLAGS="" && \ + make install && \ + mkdir /usr/share/doc/pgvector && \ + cp LICENSE README.md /usr/share/doc/pgvector && \ + rm -r /tmp/pgvector && \ + apt-get remove -y build-essential postgresql-server-dev-$PG_MAJOR && \ + apt-get autoremove -y && \ + rm -rf /var/lib/apt/lists/* diff --git a/contrib/pgvector/LICENSE b/contrib/pgvector/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..483e2b9ae90f5d0a749176fc5f07bd277507901c --- /dev/null +++ b/contrib/pgvector/LICENSE @@ -0,0 +1,20 @@ +Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + +Portions Copyright (c) 1994, The Regents of the University of California + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose, without fee, and without a written agreement +is hereby granted, provided that the above copyright notice and this +paragraph and the following two paragraphs appear in all copies. + +IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR +DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING +LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS +DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+ +THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, +INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY +AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS +ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO +PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. diff --git a/contrib/pgvector/META.json b/contrib/pgvector/META.json new file mode 100644 index 0000000000000000000000000000000000000000..a71d81086cd7a2cdc94df3293f5070680cd0776a --- /dev/null +++ b/contrib/pgvector/META.json @@ -0,0 +1,49 @@ +{ + "name": "vector", + "abstract": "Open-source vector similarity search for Postgres", + "description": "Supports L2 distance, inner product, and cosine distance", + "version": "0.4.4", + "maintainer": [ + "Andrew Kane " + ], + "license": { + "PostgreSQL": "http://www.postgresql.org/about/licence" + }, + "prereqs": { + "runtime": { + "requires": { + "PostgreSQL": "11.0.0" + } + } + }, + "provides": { + "vector": { + "file": "sql/vector.sql", + "docfile": "README.md", + "version": "0.4.4", + "abstract": "Open-source vector similarity search for Postgres" + } + }, + "resources": { + "homepage": "https://github.com/pgvector/pgvector", + "bugtracker": { + "web": "https://github.com/pgvector/pgvector/issues" + }, + "repository": { + "url": "https://github.com/pgvector/pgvector.git", + "web": "https://github.com/pgvector/pgvector", + "type": "git" + } + }, + "generated_by": "Andrew Kane", + "meta-spec": { + "version": "1.0.0", + "url": "http://pgxn.org/meta/spec.txt" + }, + "tags": [ + "vectors", + "datatype", + "nearest neighbor search", + "approximate nearest neighbors" + ] +} diff --git a/contrib/pgvector/Makefile b/contrib/pgvector/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..ff26f56cf41ac8ced78a267071519b0c8760ca71 --- /dev/null +++ b/contrib/pgvector/Makefile @@ -0,0 +1,76 @@ +EXTENSION = vector +EXTVERSION = 0.4.4 + +MODULE_big = vector +DATA = $(wildcard sql/*--*.sql) +OBJS = src/ivfbuild.o src/ivfflat.o src/ivfinsert.o src/ivfkmeans.o src/ivfscan.o src/ivfutils.o src/ivfvacuum.o src/vector.o + +TESTS = $(wildcard test/sql/*.sql) +REGRESS = $(patsubst test/sql/%.sql,%,$(TESTS)) +REGRESS_OPTS = --inputdir=test --load-extension=vector + +OPTFLAGS = -march=native + +# Mac ARM doesn't support -march=native +ifeq ($(shell uname -s), Darwin) + ifeq ($(shell uname -p), arm) + # no difference with -march=armv8.5-a + OPTFLAGS = + endif +endif + +# PowerPC doesn't support -march=native +ifneq ($(filter ppc64%, $(shell uname -m)), ) + OPTFLAGS = +endif + +# For auto-vectorization: +# - GCC (needs -ftree-vectorize OR -O3) - https://gcc.gnu.org/projects/tree-ssa/vectorization.html +# - Clang (could use pragma instead) - https://llvm.org/docs/Vectorizers.html +PG_CFLAGS += $(OPTFLAGS) -ftree-vectorize -fassociative-math -fno-signed-zeros -fno-trapping-math + +# Debug GCC auto-vectorization +# PG_CFLAGS += -fopt-info-vec + +# Debug Clang auto-vectorization +# PG_CFLAGS += -Rpass=loop-vectorize -Rpass-analysis=loop-vectorize + +all: sql/$(EXTENSION)--$(EXTVERSION).sql + +sql/$(EXTENSION)--$(EXTVERSION).sql: sql/$(EXTENSION).sql + cp $< $@ + +EXTRA_CLEAN = sql/$(EXTENSION)--$(EXTVERSION).sql + +PG_CONFIG ?= pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) + +# for Mac +ifeq ($(PROVE),) + PROVE = prove +endif + +# for Postgres 15 +PROVE_FLAGS += -I ./test/perl + +prove_installcheck: + rm -rf $(CURDIR)/tmp_check + cd $(srcdir) && TESTDIR='$(CURDIR)' 
PATH="$(bindir):$$PATH" PGPORT='6$(DEF_PGPORT)' PG_REGRESS='$(top_builddir)/src/test/regress/pg_regress' $(PROVE) $(PG_PROVE_FLAGS) $(PROVE_FLAGS) $(if $(PROVE_TESTS),$(PROVE_TESTS),test/t/*.pl) + +.PHONY: dist + +dist: + mkdir -p dist + git archive --format zip --prefix=$(EXTENSION)-$(EXTVERSION)/ --output dist/$(EXTENSION)-$(EXTVERSION).zip master + +.PHONY: docker + +docker: + docker build --pull --no-cache --platform linux/amd64 -t ankane/pgvector:latest . + +.PHONY: docker-release + +docker-release: + docker buildx build --push --pull --no-cache --platform linux/amd64,linux/arm64 -t ankane/pgvector:latest . + docker buildx build --push --platform linux/amd64,linux/arm64 -t ankane/pgvector:v$(EXTVERSION) . diff --git a/contrib/pgvector/Makefile.win b/contrib/pgvector/Makefile.win new file mode 100644 index 0000000000000000000000000000000000000000..8ceb572470df159a24a28df8ca752de06905c0fa --- /dev/null +++ b/contrib/pgvector/Makefile.win @@ -0,0 +1,70 @@ +EXTENSION = vector +EXTVERSION = 0.4.4 + +OBJS = src\ivfbuild.obj src\ivfflat.obj src\ivfinsert.obj src\ivfkmeans.obj src\ivfscan.obj src\ivfutils.obj src\ivfvacuum.obj src\vector.obj + +REGRESS = btree cast copy functions input ivfflat_cosine ivfflat_ip ivfflat_l2 ivfflat_options ivfflat_unlogged +REGRESS_OPTS = --inputdir=test --load-extension=vector + +# For /arch flags +# https://learn.microsoft.com/en-us/cpp/build/reference/arch-minimum-cpu-architecture +OPTFLAGS = + +# For auto-vectorization: +# - MSVC (needs /O2 /fp:fast) - https://learn.microsoft.com/en-us/cpp/parallel/auto-parallelization-and-auto-vectorization?#auto-vectorizer +PG_CFLAGS = $(PG_CFLAGS) $(OPTFLAGS) /O2 /fp:fast + +# Debug MSVC auto-vectorization +# https://learn.microsoft.com/en-us/cpp/error-messages/tool-errors/vectorizer-and-parallelizer-messages +# PG_CFLAGS = $(PG_CFLAGS) /Qvec-report:2 + +all: sql\$(EXTENSION)--$(EXTVERSION).sql + +sql\$(EXTENSION)--$(EXTVERSION).sql: sql\$(EXTENSION).sql + copy sql\$(EXTENSION).sql $@ + +# TODO use pg_config +!ifndef PGROOT +!error PGROOT is not set +!endif +BINDIR = $(PGROOT)\bin +INCLUDEDIR = $(PGROOT)\include +INCLUDEDIR_SERVER = $(PGROOT)\include\server +LIBDIR = $(PGROOT)\lib +PKGLIBDIR = $(PGROOT)\lib +SHAREDIR = $(PGROOT)\share + +CFLAGS = /nologo /I"$(INCLUDEDIR_SERVER)\port\win32_msvc" /I"$(INCLUDEDIR_SERVER)\port\win32" /I"$(INCLUDEDIR_SERVER)" /I"$(INCLUDEDIR)" + +CFLAGS = $(CFLAGS) $(PG_CFLAGS) + +SHLIB = $(EXTENSION).dll + +LIBS = "$(LIBDIR)\postgres.lib" + +.c.obj: + $(CC) $(CFLAGS) /c $< /Fo$@ + +$(SHLIB): $(OBJS) + $(CC) $(CFLAGS) $(OBJS) $(LIBS) /link /DLL /OUT:$(SHLIB) + +all: $(SHLIB) + +install: + copy $(SHLIB) "$(PKGLIBDIR)" + copy $(EXTENSION).control "$(SHAREDIR)\extension" + copy sql\$(EXTENSION)--*.sql "$(SHAREDIR)\extension" + +installcheck: + "$(BINDIR)\pg_regress" --bindir="$(BINDIR)" $(REGRESS_OPTS) $(REGRESS) + +uninstall: + del /f "$(PKGLIBDIR)\$(SHLIB)" + del /f "$(SHAREDIR)\extension\$(EXTENSION).control" + del /f "$(SHAREDIR)\extension\vector--*.sql" + +clean: + del /f $(SHLIB) $(EXTENSION).lib $(EXTENSION).exp + del /f $(OBJS) + del /f sql\$(EXTENSION)--$(EXTVERSION).sql + del /f /s /q results regression.diffs regression.out tmp_check tmp_check_iso log output_iso diff --git a/contrib/pgvector/README.md b/contrib/pgvector/README.md new file mode 100644 index 0000000000000000000000000000000000000000..be02c961efebc249b02a965284ee238c172c0f06 --- /dev/null +++ b/contrib/pgvector/README.md @@ -0,0 +1,580 @@ +# pgvector + +Open-source vector similarity search for Postgres + +Supports + +- 
exact and approximate nearest neighbor search +- L2 distance, inner product, and cosine distance +- any [language](#languages) with a Postgres client + +Plus [ACID](https://en.wikipedia.org/wiki/ACID) compliance, point-in-time recovery, JOINs, and all of the other [great features](https://www.postgresql.org/about/) of Postgres + +[![Build Status](https://github.com/pgvector/pgvector/workflows/build/badge.svg?branch=master)](https://github.com/pgvector/pgvector/actions) + +## Installation + +Compile and install the extension (supports Postgres 11+) + +```sh +cd /tmp +git clone --branch v0.4.4 https://github.com/pgvector/pgvector.git +cd pgvector +make +make install # may need sudo +``` + +See the [installation notes](#installation-notes) if you run into issues + +You can also install it with [Docker](#docker), [Homebrew](#homebrew), [PGXN](#pgxn), [APT](#apt), [Yum](#yum), or [conda-forge](#conda-forge), and it comes preinstalled with [Postgres.app](#postgresapp) and many [hosted providers](#hosted-postgres) + +## Getting Started + +Enable the extension (do this once in each database where you want to use it) + +```tsql +CREATE EXTENSION vector; +``` + +Create a vector column with 3 dimensions + +```sql +CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3)); +``` + +Insert vectors + +```sql +INSERT INTO items (embedding) VALUES ('[1,2,3]'), ('[4,5,6]'); +``` + +Get the nearest neighbors by L2 distance + +```sql +SELECT * FROM items ORDER BY embedding <-> '[3,1,2]' LIMIT 5; +``` + +Also supports inner product (`<#>`) and cosine distance (`<=>`) + +Note: `<#>` returns the negative inner product since Postgres only supports `ASC` order index scans on operators + +## Storing + +Create a new table with a vector column + +```sql +CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3)); +``` + +Or add a vector column to an existing table + +```sql +ALTER TABLE items ADD COLUMN embedding vector(3); +``` + +Insert vectors + +```sql +INSERT INTO items (embedding) VALUES ('[1,2,3]'), ('[4,5,6]'); +``` + +Upsert vectors + +```sql +INSERT INTO items (id, embedding) VALUES (1, '[1,2,3]'), (2, '[4,5,6]') + ON CONFLICT (id) DO UPDATE SET embedding = EXCLUDED.embedding; +``` + +Update vectors + +```sql +UPDATE items SET embedding = '[1,2,3]' WHERE id = 1; +``` + +Delete vectors + +```sql +DELETE FROM items WHERE id = 1; +``` + +## Querying + +Get the nearest neighbors to a vector + +```sql +SELECT * FROM items ORDER BY embedding <-> '[3,1,2]' LIMIT 5; +``` + +Get the nearest neighbors to a row + +```sql +SELECT * FROM items WHERE id != 1 ORDER BY embedding <-> (SELECT embedding FROM items WHERE id = 1) LIMIT 5; +``` + +Get rows within a certain distance + +```sql +SELECT * FROM items WHERE embedding <-> '[3,1,2]' < 5; +``` + +Note: Combine with `ORDER BY` and `LIMIT` to use an index + +#### Distances + +Get the distance + +```sql +SELECT embedding <-> '[3,1,2]' AS distance FROM items; +``` + +For inner product, multiply by -1 (since `<#>` returns the negative inner product) + +```tsql +SELECT (embedding <#> '[3,1,2]') * -1 AS inner_product FROM items; +``` + +For cosine similarity, use 1 - cosine distance + +```sql +SELECT 1 - (embedding <=> '[3,1,2]') AS cosine_similarity FROM items; +``` + +#### Aggregates + +Average vectors + +```sql +SELECT AVG(embedding) FROM items; +``` + +Average groups of vectors + +```sql +SELECT category_id, AVG(embedding) FROM items GROUP BY category_id; +``` + +## Indexing + +By default, pgvector performs exact nearest neighbor search, which provides 
perfect recall. + +You can add an index to use approximate nearest neighbor search, which trades some recall for performance. Unlike typical indexes, you will see different results for queries after adding an approximate index. + +Three keys to achieving good recall are: + +1. Create the index *after* the table has some data +2. Choose an appropriate number of lists - a good place to start is `rows / 1000` for up to 1M rows and `sqrt(rows)` for over 1M rows +3. When querying, specify an appropriate number of [probes](#query-options) (higher is better for recall, lower is better for speed) - a good place to start is `sqrt(lists)` + +Add an index for each distance function you want to use. + +L2 distance + +```sql +CREATE INDEX ON items USING ivfflat (embedding vector_l2_ops) WITH (lists = 100); +``` + +Inner product + +```sql +CREATE INDEX ON items USING ivfflat (embedding vector_ip_ops) WITH (lists = 100); +``` + +Cosine distance + +```sql +CREATE INDEX ON items USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100); +``` + +Vectors with up to 2,000 dimensions can be indexed. + +### Query Options + +Specify the number of probes (1 by default) + +```sql +SET ivfflat.probes = 10; +``` + +A higher value provides better recall at the cost of speed, and it can be set to the number of lists for exact nearest neighbor search (at which point the planner won’t use the index) + +Use `SET LOCAL` inside a transaction to set it for a single query + +```sql +BEGIN; +SET LOCAL ivfflat.probes = 10; +SELECT ... +COMMIT; +``` + +### Indexing Progress + +Check [indexing progress](https://www.postgresql.org/docs/current/progress-reporting.html#CREATE-INDEX-PROGRESS-REPORTING) with Postgres 12+ + +```sql +SELECT phase, tuples_done, tuples_total FROM pg_stat_progress_create_index; +``` + +The phases are: + +1. `initializing` +2. `performing k-means` +3. `sorting tuples` +4. `loading tuples` + +Note: `tuples_done` and `tuples_total` are only populated during the `loading tuples` phase + +### Filtering + +There are a few ways to index nearest neighbor queries with a `WHERE` clause + +```sql +SELECT * FROM items WHERE category_id = 123 ORDER BY embedding <-> '[3,1,2]' LIMIT 5; +``` + +Create an index on one [or more](https://www.postgresql.org/docs/current/indexes-multicolumn.html) of the `WHERE` columns for exact search + +```sql +CREATE INDEX ON items (category_id); +``` + +Or a [partial index](https://www.postgresql.org/docs/current/indexes-partial.html) on the vector column for approximate search + +```sql +CREATE INDEX ON items USING ivfflat (embedding vector_l2_ops) WITH (lists = 100) + WHERE (category_id = 123); +``` + +Use [partitioning](https://www.postgresql.org/docs/current/ddl-partitioning.html) for approximate search on many different values of the `WHERE` columns + +```sql +CREATE TABLE items (embedding vector(3), category_id int) PARTITION BY LIST(category_id); +``` + +## Hybrid Search + +Use together with Postgres [full-text search](https://www.postgresql.org/docs/current/textsearch-intro.html) for hybrid search ([Python example](https://github.com/pgvector/pgvector-python/blob/master/examples/hybrid_search.py)). + +```sql +SELECT id, content FROM items, to_tsquery('hello & search') query + WHERE textsearch @@ query ORDER BY ts_rank_cd(textsearch, query) DESC LIMIT 5; +``` + +## Performance + +Use `EXPLAIN ANALYZE` to debug performance. 
+ +```sql +EXPLAIN ANALYZE SELECT * FROM items ORDER BY embedding <-> '[3,1,2]' LIMIT 5; +``` + +### Exact Search + +To speed up queries without an index, increase `max_parallel_workers_per_gather`. + +```sql +SET max_parallel_workers_per_gather = 4; +``` + +If vectors are normalized to length 1 (like [OpenAI embeddings](https://platform.openai.com/docs/guides/embeddings/which-distance-function-should-i-use)), use inner product for best performance. + +```tsql +SELECT * FROM items ORDER BY embedding <#> '[3,1,2]' LIMIT 5; +``` + +### Approximate Search + +To speed up queries with an index, increase the number of inverted lists (at the expense of recall). + +```sql +CREATE INDEX ON items USING ivfflat (embedding vector_l2_ops) WITH (lists = 1000); +``` + +## Languages + +Use pgvector from any language with a Postgres client. You can even generate and store vectors in one language and query them in another. + +Language | Libraries / Examples +--- | --- +C++ | [pgvector-cpp](https://github.com/pgvector/pgvector-cpp) +C# | [pgvector-dotnet](https://github.com/pgvector/pgvector-dotnet) +Crystal | [pgvector-crystal](https://github.com/pgvector/pgvector-crystal) +Elixir | [pgvector-elixir](https://github.com/pgvector/pgvector-elixir) +Go | [pgvector-go](https://github.com/pgvector/pgvector-go) +Haskell | [pgvector-haskell](https://github.com/pgvector/pgvector-haskell) +Java, Scala | [pgvector-java](https://github.com/pgvector/pgvector-java) +Julia | [pgvector-julia](https://github.com/pgvector/pgvector-julia) +Lua | [pgvector-lua](https://github.com/pgvector/pgvector-lua) +Node.js | [pgvector-node](https://github.com/pgvector/pgvector-node) +Perl | [pgvector-perl](https://github.com/pgvector/pgvector-perl) +PHP | [pgvector-php](https://github.com/pgvector/pgvector-php) +Python | [pgvector-python](https://github.com/pgvector/pgvector-python) +R | [pgvector-r](https://github.com/pgvector/pgvector-r) +Ruby | [pgvector-ruby](https://github.com/pgvector/pgvector-ruby), [Neighbor](https://github.com/ankane/neighbor) +Rust | [pgvector-rust](https://github.com/pgvector/pgvector-rust) +Swift | [pgvector-swift](https://github.com/pgvector/pgvector-swift) + +## Frequently Asked Questions + +#### How many vectors can be stored in a single table? + +A non-partitioned table has a limit of 32 TB by default in Postgres. A partitioned table can have thousands of partitions of that size. + +#### Is replication supported? + +Yes, pgvector uses the write-ahead log (WAL), which allows for replication and point-in-time recovery. + +#### What if I want to index vectors with more than 2,000 dimensions? + +You’ll need to use [dimensionality reduction](https://en.wikipedia.org/wiki/Dimensionality_reduction) at the moment. + +#### Why am I seeing less results after adding an index? + +The index was likely created with too little data for the number of lists. Drop the index until the table has more data. + +## Reference + +### Vector Type + +Each vector takes `4 * dimensions + 8` bytes of storage. Each element is a single precision floating-point number (like the `real` type in Postgres), and all elements must be finite (no `NaN`, `Infinity` or `-Infinity`). Vectors can have up to 16,000 dimensions. 
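+As a rough sanity check of the storage formula, `pg_column_size` on a vector value should report `4 * dimensions + 8` bytes (values stored in a table can end up smaller once TOAST compression of the `extended` storage kicks in):
+
+```sql
+-- 3 dimensions: 4 * 3 + 8 = 20 bytes
+SELECT pg_column_size('[1,2,3]'::vector);
+```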
+ +### Vector Operators + +Operator | Description +--- | --- +\+ | element-wise addition +\- | element-wise subtraction +<-> | Euclidean distance +<#> | negative inner product +<=> | cosine distance + +### Vector Functions + +Function | Description +--- | --- +cosine_distance(vector, vector) → double precision | cosine distance +inner_product(vector, vector) → double precision | inner product +l2_distance(vector, vector) → double precision | Euclidean distance +vector_dims(vector) → integer | number of dimensions +vector_norm(vector) → double precision | Euclidean norm + +### Aggregate Functions + +Function | Description +--- | --- +avg(vector) → vector | arithmetic mean + +## Installation Notes + +### Postgres Location + +If your machine has multiple Postgres installations, specify the path to [pg_config](https://www.postgresql.org/docs/current/app-pgconfig.html) with: + +```sh +export PG_CONFIG=/Applications/Postgres.app/Contents/Versions/latest/bin/pg_config +``` + +Then re-run the installation instructions (run `make clean` before `make` if needed). If `sudo` is needed for `make install`, use: + +```sh +sudo --preserve-env=PG_CONFIG make install +``` + +### Missing Header + +If compilation fails with `fatal error: postgres.h: No such file or directory`, make sure Postgres development files are installed on the server. + +For Ubuntu and Debian, use: + +```sh +sudo apt install postgresql-server-dev-15 +``` + +Note: Replace `15` with your Postgres server version + +### Windows + +Support for Windows is currently experimental. Use `nmake` to build: + +```cmd +set "PGROOT=C:\Program Files\PostgreSQL\15" +git clone --branch v0.4.4 https://github.com/pgvector/pgvector.git +cd pgvector +nmake /F Makefile.win +nmake /F Makefile.win install +``` + +## Additional Installation Methods + +### Docker + +Get the [Docker image](https://hub.docker.com/r/ankane/pgvector) with: + +```sh +docker pull ankane/pgvector +``` + +This adds pgvector to the [Postgres image](https://hub.docker.com/_/postgres) (run it the same way). + +You can also build the image manually: + +```sh +git clone --branch v0.4.4 https://github.com/pgvector/pgvector.git +cd pgvector +docker build --build-arg PG_MAJOR=15 -t myuser/pgvector . +``` + +### Homebrew + +With Homebrew Postgres, you can use: + +```sh +brew install pgvector +``` + +Note: This only adds it to the `postgresql@14` formula + +### PGXN + +Install from the [PostgreSQL Extension Network](https://pgxn.org/dist/vector) with: + +```sh +pgxn install vector +``` + +### APT + +Debian and Ubuntu packages are available from the [PostgreSQL APT Repository](https://wiki.postgresql.org/wiki/Apt). Follow the [setup instructions](https://wiki.postgresql.org/wiki/Apt#Quickstart) and run: + +```sh +sudo apt install postgresql-15-pgvector +``` + +Note: Replace `15` with your Postgres server version + +### Yum + +RPM packages are available from the [PostgreSQL Yum Repository](https://yum.postgresql.org/). 
Follow the [setup instructions](https://www.postgresql.org/download/linux/redhat/) for your distribution and run: + +```sh +sudo yum install pgvector_15 +# or +sudo dnf install pgvector_15 +``` + +Note: Replace `15` with your Postgres server version + +### conda-forge + +With Conda Postgres, install from [conda-forge](https://anaconda.org/conda-forge/pgvector) with: + +```sh +conda install -c conda-forge pgvector +``` + +This method is [community-maintained](https://github.com/conda-forge/pgvector-feedstock) by [@mmcauliffe](https://github.com/mmcauliffe) + +### Postgres.app + +Download the [latest release](https://postgresapp.com/downloads.html) with Postgres 15+. + +## Hosted Postgres + +pgvector is available on [these providers](https://github.com/pgvector/pgvector/issues/54). + +To request a new extension on other providers: + +- Google Cloud SQL - vote or comment on [this page](https://issuetracker.google.com/issues/265172065) +- DigitalOcean Managed Databases - vote or comment on [this page](https://ideas.digitalocean.com/managed-database/p/pgvector-extension-for-postgresql) +- Heroku Postgres - vote or comment on [this page](https://github.com/heroku/roadmap/issues/156) + +## Upgrading + +Install the latest version and run: + +```sql +ALTER EXTENSION vector UPDATE; +``` + +## Upgrade Notes + +### 0.4.0 + +If upgrading with Postgres < 13, remove this line from `sql/vector--0.3.2--0.4.0.sql`: + +```sql +ALTER TYPE vector SET (STORAGE = extended); +``` + +Then run `make install` and `ALTER EXTENSION vector UPDATE;`. + +### 0.3.1 + +If upgrading from 0.2.7 or 0.3.0, recreate all `ivfflat` indexes after upgrading to ensure all data is indexed. + +```sql +-- Postgres 12+ +REINDEX INDEX CONCURRENTLY index_name; + +-- Postgres < 12 +CREATE INDEX CONCURRENTLY temp_name ON table USING ivfflat (column opclass); +DROP INDEX CONCURRENTLY index_name; +ALTER INDEX temp_name RENAME TO index_name; +``` + +## Thanks + +Thanks to: + +- [PASE: PostgreSQL Ultra-High-Dimensional Approximate Nearest Neighbor Search Extension](https://dl.acm.org/doi/pdf/10.1145/3318464.3386131) +- [Faiss: A Library for Efficient Similarity Search and Clustering of Dense Vectors](https://github.com/facebookresearch/faiss) +- [Using the Triangle Inequality to Accelerate k-means](https://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf) +- [k-means++: The Advantage of Careful Seeding](https://theory.stanford.edu/~sergei/papers/kMeansPP-soda.pdf) +- [Concept Decompositions for Large Sparse Text Data using Clustering](https://www.cs.utexas.edu/users/inderjit/public_papers/concept_mlj.pdf) + +## History + +View the [changelog](https://github.com/pgvector/pgvector/blob/master/CHANGELOG.md) + +## Contributing + +Everyone is encouraged to help improve this project. 
Here are a few ways you can help: + +- [Report bugs](https://github.com/pgvector/pgvector/issues) +- Fix bugs and [submit pull requests](https://github.com/pgvector/pgvector/pulls) +- Write, clarify, or fix documentation +- Suggest or add new features + +To get started with development: + +```sh +git clone https://github.com/pgvector/pgvector.git +cd pgvector +make +make install +``` + +To run all tests: + +```sh +make installcheck # regression tests +make prove_installcheck # TAP tests +``` + +To run single tests: + +```sh +make installcheck REGRESS=functions # regression test +make prove_installcheck PROVE_TESTS=test/t/001_wal.pl # TAP test +``` + +To enable benchmarking: + +```sh +make clean && PG_CFLAGS=-DIVFFLAT_BENCH make && make install +``` + +Resources for contributors + +- [Extension Building Infrastructure](https://www.postgresql.org/docs/current/extend-pgxs.html) +- [Index Access Method Interface Definition](https://www.postgresql.org/docs/current/indexam.html) +- [Generic WAL Records](https://www.postgresql.org/docs/13/generic-wal.html) diff --git a/contrib/pgvector/sql/vector--0.1.0--0.1.1.sql b/contrib/pgvector/sql/vector--0.1.0--0.1.1.sql new file mode 100644 index 0000000000000000000000000000000000000000..959a0d72261a64af629e0c9be78767337743c26d --- /dev/null +++ b/contrib/pgvector/sql/vector--0.1.0--0.1.1.sql @@ -0,0 +1,39 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION vector UPDATE TO '0.1.1'" to load this file. \quit + +CREATE FUNCTION vector_recv(internal, oid, integer) RETURNS vector + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT; + +CREATE FUNCTION vector_send(vector) RETURNS bytea + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT; + +ALTER TYPE vector SET ( RECEIVE = vector_recv, SEND = vector_send ); + +-- functions + +ALTER FUNCTION vector_in(cstring, oid, integer) PARALLEL SAFE; +ALTER FUNCTION vector_out(vector) PARALLEL SAFE; +ALTER FUNCTION vector_typmod_in(cstring[]) PARALLEL SAFE; +ALTER FUNCTION vector_recv(internal, oid, integer) PARALLEL SAFE; +ALTER FUNCTION vector_send(vector) PARALLEL SAFE; +ALTER FUNCTION l2_distance(vector, vector) PARALLEL SAFE; +ALTER FUNCTION inner_product(vector, vector) PARALLEL SAFE; +ALTER FUNCTION cosine_distance(vector, vector) PARALLEL SAFE; +ALTER FUNCTION vector_dims(vector) PARALLEL SAFE; +ALTER FUNCTION vector_norm(vector) PARALLEL SAFE; +ALTER FUNCTION vector_add(vector, vector) PARALLEL SAFE; +ALTER FUNCTION vector_sub(vector, vector) PARALLEL SAFE; +ALTER FUNCTION vector_lt(vector, vector) PARALLEL SAFE; +ALTER FUNCTION vector_le(vector, vector) PARALLEL SAFE; +ALTER FUNCTION vector_eq(vector, vector) PARALLEL SAFE; +ALTER FUNCTION vector_ne(vector, vector) PARALLEL SAFE; +ALTER FUNCTION vector_ge(vector, vector) PARALLEL SAFE; +ALTER FUNCTION vector_gt(vector, vector) PARALLEL SAFE; +ALTER FUNCTION vector_cmp(vector, vector) PARALLEL SAFE; +ALTER FUNCTION vector_l2_squared_distance(vector, vector) PARALLEL SAFE; +ALTER FUNCTION vector_negative_inner_product(vector, vector) PARALLEL SAFE; +ALTER FUNCTION vector_spherical_distance(vector, vector) PARALLEL SAFE; +ALTER FUNCTION vector(vector, integer, boolean) PARALLEL SAFE; +ALTER FUNCTION array_to_vector(integer[], integer, boolean) PARALLEL SAFE; +ALTER FUNCTION array_to_vector(real[], integer, boolean) PARALLEL SAFE; +ALTER FUNCTION array_to_vector(double precision[], integer, boolean) PARALLEL SAFE; diff --git a/contrib/pgvector/sql/vector--0.1.1--0.1.3.sql 
b/contrib/pgvector/sql/vector--0.1.1--0.1.3.sql new file mode 100644 index 0000000000000000000000000000000000000000..391835f865ccc20415cd559e27e5358867eb2bb3 --- /dev/null +++ b/contrib/pgvector/sql/vector--0.1.1--0.1.3.sql @@ -0,0 +1,2 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION vector UPDATE TO '0.1.3'" to load this file. \quit diff --git a/contrib/pgvector/sql/vector--0.1.3--0.1.4.sql b/contrib/pgvector/sql/vector--0.1.3--0.1.4.sql new file mode 100644 index 0000000000000000000000000000000000000000..56ab0eb501c46501ed45079c23dbd8f620d497a7 --- /dev/null +++ b/contrib/pgvector/sql/vector--0.1.3--0.1.4.sql @@ -0,0 +1,2 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION vector UPDATE TO '0.1.4'" to load this file. \quit diff --git a/contrib/pgvector/sql/vector--0.1.4--0.1.5.sql b/contrib/pgvector/sql/vector--0.1.4--0.1.5.sql new file mode 100644 index 0000000000000000000000000000000000000000..3996b2dcd8468ee25b5ff5129493fd0ea71b57f1 --- /dev/null +++ b/contrib/pgvector/sql/vector--0.1.4--0.1.5.sql @@ -0,0 +1,2 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION vector UPDATE TO '0.1.5'" to load this file. \quit diff --git a/contrib/pgvector/sql/vector--0.1.5--0.1.6.sql b/contrib/pgvector/sql/vector--0.1.5--0.1.6.sql new file mode 100644 index 0000000000000000000000000000000000000000..fdb605b0b95831aa8a1d846aa1c6d0613e6ded25 --- /dev/null +++ b/contrib/pgvector/sql/vector--0.1.5--0.1.6.sql @@ -0,0 +1,2 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION vector UPDATE TO '0.1.6'" to load this file. \quit diff --git a/contrib/pgvector/sql/vector--0.1.6--0.1.7.sql b/contrib/pgvector/sql/vector--0.1.6--0.1.7.sql new file mode 100644 index 0000000000000000000000000000000000000000..fcd32f45a90a323e9caf2b19ce69ca5c9e1d4007 --- /dev/null +++ b/contrib/pgvector/sql/vector--0.1.6--0.1.7.sql @@ -0,0 +1,8 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION vector UPDATE TO '0.1.7'" to load this file. \quit + +CREATE FUNCTION array_to_vector(numeric[], integer, boolean) RETURNS vector + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE CAST (numeric[] AS vector) + WITH FUNCTION array_to_vector(numeric[], integer, boolean) AS IMPLICIT; diff --git a/contrib/pgvector/sql/vector--0.1.7--0.1.8.sql b/contrib/pgvector/sql/vector--0.1.7--0.1.8.sql new file mode 100644 index 0000000000000000000000000000000000000000..5a387a76b6d2867775c4f081f56ee47b71d4663a --- /dev/null +++ b/contrib/pgvector/sql/vector--0.1.7--0.1.8.sql @@ -0,0 +1,8 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION vector UPDATE TO '0.1.8'" to load this file. 
\quit + +CREATE FUNCTION vector_to_float4(vector, integer, boolean) RETURNS real[] + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE CAST (vector AS real[]) + WITH FUNCTION vector_to_float4(vector, integer, boolean) AS IMPLICIT; diff --git a/contrib/pgvector/sql/vector--0.1.8--0.2.0.sql b/contrib/pgvector/sql/vector--0.1.8--0.2.0.sql new file mode 100644 index 0000000000000000000000000000000000000000..1ce0d1efd65862d972eb594ba066ee11ccea0c7f --- /dev/null +++ b/contrib/pgvector/sql/vector--0.1.8--0.2.0.sql @@ -0,0 +1,2 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION vector UPDATE TO '0.2.0'" to load this file. \quit diff --git a/contrib/pgvector/sql/vector--0.2.0--0.2.1.sql b/contrib/pgvector/sql/vector--0.2.0--0.2.1.sql new file mode 100644 index 0000000000000000000000000000000000000000..47606deb3ad59b8566149d6be32879949ff9e23e --- /dev/null +++ b/contrib/pgvector/sql/vector--0.2.0--0.2.1.sql @@ -0,0 +1,19 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION vector UPDATE TO '0.2.1'" to load this file. \quit + +DROP CAST (integer[] AS vector); +DROP CAST (real[] AS vector); +DROP CAST (double precision[] AS vector); +DROP CAST (numeric[] AS vector); + +CREATE CAST (integer[] AS vector) + WITH FUNCTION array_to_vector(integer[], integer, boolean) AS ASSIGNMENT; + +CREATE CAST (real[] AS vector) + WITH FUNCTION array_to_vector(real[], integer, boolean) AS ASSIGNMENT; + +CREATE CAST (double precision[] AS vector) + WITH FUNCTION array_to_vector(double precision[], integer, boolean) AS ASSIGNMENT; + +CREATE CAST (numeric[] AS vector) + WITH FUNCTION array_to_vector(numeric[], integer, boolean) AS ASSIGNMENT; diff --git a/contrib/pgvector/sql/vector--0.2.1--0.2.2.sql b/contrib/pgvector/sql/vector--0.2.1--0.2.2.sql new file mode 100644 index 0000000000000000000000000000000000000000..697c1408d70812f5d144bcd16b24ade4aa3c8d96 --- /dev/null +++ b/contrib/pgvector/sql/vector--0.2.1--0.2.2.sql @@ -0,0 +1,2 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION vector UPDATE TO '0.2.2'" to load this file. \quit diff --git a/contrib/pgvector/sql/vector--0.2.2--0.2.3.sql b/contrib/pgvector/sql/vector--0.2.2--0.2.3.sql new file mode 100644 index 0000000000000000000000000000000000000000..32b07dc228fb46fc6135e82a22b7b6425fecb303 --- /dev/null +++ b/contrib/pgvector/sql/vector--0.2.2--0.2.3.sql @@ -0,0 +1,2 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION vector UPDATE TO '0.2.3'" to load this file. \quit diff --git a/contrib/pgvector/sql/vector--0.2.3--0.2.4.sql b/contrib/pgvector/sql/vector--0.2.3--0.2.4.sql new file mode 100644 index 0000000000000000000000000000000000000000..5d1b34168ba9bdccdf1a08edc177716101632885 --- /dev/null +++ b/contrib/pgvector/sql/vector--0.2.3--0.2.4.sql @@ -0,0 +1,2 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION vector UPDATE TO '0.2.4'" to load this file. 
\quit diff --git a/contrib/pgvector/sql/vector--0.2.4--0.2.5.sql b/contrib/pgvector/sql/vector--0.2.4--0.2.5.sql new file mode 100644 index 0000000000000000000000000000000000000000..b372ed0c8c30434023f8d2724b85bd89ce054199 --- /dev/null +++ b/contrib/pgvector/sql/vector--0.2.4--0.2.5.sql @@ -0,0 +1,2 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION vector UPDATE TO '0.2.5'" to load this file. \quit diff --git a/contrib/pgvector/sql/vector--0.2.5--0.2.6.sql b/contrib/pgvector/sql/vector--0.2.5--0.2.6.sql new file mode 100644 index 0000000000000000000000000000000000000000..e68c1ac0374de87eb539ea6a5ab10048692d772d --- /dev/null +++ b/contrib/pgvector/sql/vector--0.2.5--0.2.6.sql @@ -0,0 +1,2 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION vector UPDATE TO '0.2.6'" to load this file. \quit diff --git a/contrib/pgvector/sql/vector--0.2.6--0.2.7.sql b/contrib/pgvector/sql/vector--0.2.6--0.2.7.sql new file mode 100644 index 0000000000000000000000000000000000000000..227c2171c41f425b30bc1c3507d3814d686551a9 --- /dev/null +++ b/contrib/pgvector/sql/vector--0.2.6--0.2.7.sql @@ -0,0 +1,2 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION vector UPDATE TO '0.2.7'" to load this file. \quit diff --git a/contrib/pgvector/sql/vector--0.2.7--0.3.0.sql b/contrib/pgvector/sql/vector--0.2.7--0.3.0.sql new file mode 100644 index 0000000000000000000000000000000000000000..7e62d39e7288f0f9c556b0acf91161287a774892 --- /dev/null +++ b/contrib/pgvector/sql/vector--0.2.7--0.3.0.sql @@ -0,0 +1,2 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION vector UPDATE TO '0.3.0'" to load this file. \quit diff --git a/contrib/pgvector/sql/vector--0.3.0--0.3.1.sql b/contrib/pgvector/sql/vector--0.3.0--0.3.1.sql new file mode 100644 index 0000000000000000000000000000000000000000..f1a8fbce5ae13f5a86fc6626c8e3da321e3eb694 --- /dev/null +++ b/contrib/pgvector/sql/vector--0.3.0--0.3.1.sql @@ -0,0 +1,2 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION vector UPDATE TO '0.3.1'" to load this file. \quit diff --git a/contrib/pgvector/sql/vector--0.3.1--0.3.2.sql b/contrib/pgvector/sql/vector--0.3.1--0.3.2.sql new file mode 100644 index 0000000000000000000000000000000000000000..c3461a103398346329d457016f35efacfa07232a --- /dev/null +++ b/contrib/pgvector/sql/vector--0.3.1--0.3.2.sql @@ -0,0 +1,2 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION vector UPDATE TO '0.3.2'" to load this file. \quit diff --git a/contrib/pgvector/sql/vector--0.3.2--0.4.0.sql b/contrib/pgvector/sql/vector--0.3.2--0.4.0.sql new file mode 100644 index 0000000000000000000000000000000000000000..3652664777c0fcc2f6e4f4e9b64dd5ebc8470a31 --- /dev/null +++ b/contrib/pgvector/sql/vector--0.3.2--0.4.0.sql @@ -0,0 +1,23 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION vector UPDATE TO '0.4.0'" to load this file. 
\quit + +-- remove this single line for Postgres < 13 +ALTER TYPE vector SET (STORAGE = extended); + +CREATE FUNCTION vector_accum(double precision[], vector) RETURNS double precision[] + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION vector_avg(double precision[]) RETURNS vector + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION vector_combine(double precision[], double precision[]) RETURNS double precision[] + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE AGGREGATE avg(vector) ( + SFUNC = vector_accum, + STYPE = double precision[], + FINALFUNC = vector_avg, + COMBINEFUNC = vector_combine, + INITCOND = '{0}', + PARALLEL = SAFE +); diff --git a/contrib/pgvector/sql/vector--0.4.0--0.4.1.sql b/contrib/pgvector/sql/vector--0.4.0--0.4.1.sql new file mode 100644 index 0000000000000000000000000000000000000000..67ba57ef924ed43f5b372f3fa3339655557e38f0 --- /dev/null +++ b/contrib/pgvector/sql/vector--0.4.0--0.4.1.sql @@ -0,0 +1,2 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION vector UPDATE TO '0.4.1'" to load this file. \quit diff --git a/contrib/pgvector/sql/vector--0.4.1--0.4.2.sql b/contrib/pgvector/sql/vector--0.4.1--0.4.2.sql new file mode 100644 index 0000000000000000000000000000000000000000..24abacce05f2183d498cf4663ae5ae763581aa9b --- /dev/null +++ b/contrib/pgvector/sql/vector--0.4.1--0.4.2.sql @@ -0,0 +1,2 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION vector UPDATE TO '0.4.2'" to load this file. \quit diff --git a/contrib/pgvector/sql/vector--0.4.2--0.4.3.sql b/contrib/pgvector/sql/vector--0.4.2--0.4.3.sql new file mode 100644 index 0000000000000000000000000000000000000000..3db510e557ee6572c032ce8bc6fbc18f1fbd7ee5 --- /dev/null +++ b/contrib/pgvector/sql/vector--0.4.2--0.4.3.sql @@ -0,0 +1,2 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION vector UPDATE TO '0.4.3'" to load this file. \quit diff --git a/contrib/pgvector/sql/vector--0.4.3--0.4.4.sql b/contrib/pgvector/sql/vector--0.4.3--0.4.4.sql new file mode 100644 index 0000000000000000000000000000000000000000..49c4ab4ef77ae8b53fd3bb2540a3873721d38680 --- /dev/null +++ b/contrib/pgvector/sql/vector--0.4.3--0.4.4.sql @@ -0,0 +1,2 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION vector UPDATE TO '0.4.4'" to load this file. \quit diff --git a/contrib/pgvector/sql/vector.sql b/contrib/pgvector/sql/vector.sql new file mode 100644 index 0000000000000000000000000000000000000000..6188e2eafb12197d7ffab25f19eb17dfa7ba3a3f --- /dev/null +++ b/contrib/pgvector/sql/vector.sql @@ -0,0 +1,251 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION vector" to load this file. 
\quit + +-- type + +CREATE TYPE vector; + +CREATE FUNCTION vector_in(cstring, oid, integer) RETURNS vector + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION vector_out(vector) RETURNS cstring + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION vector_typmod_in(cstring[]) RETURNS integer + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION vector_recv(internal, oid, integer) RETURNS vector + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION vector_send(vector) RETURNS bytea + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE TYPE vector ( + INPUT = vector_in, + OUTPUT = vector_out, + TYPMOD_IN = vector_typmod_in, + RECEIVE = vector_recv, + SEND = vector_send, + STORAGE = extended +); + +-- functions + +CREATE FUNCTION l2_distance(vector, vector) RETURNS float8 + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION inner_product(vector, vector) RETURNS float8 + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION cosine_distance(vector, vector) RETURNS float8 + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION vector_dims(vector) RETURNS integer + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION vector_norm(vector) RETURNS float8 + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION vector_add(vector, vector) RETURNS vector + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION vector_sub(vector, vector) RETURNS vector + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +-- private functions + +CREATE FUNCTION vector_lt(vector, vector) RETURNS bool + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION vector_le(vector, vector) RETURNS bool + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION vector_eq(vector, vector) RETURNS bool + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION vector_ne(vector, vector) RETURNS bool + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION vector_ge(vector, vector) RETURNS bool + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION vector_gt(vector, vector) RETURNS bool + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION vector_cmp(vector, vector) RETURNS int4 + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION vector_l2_squared_distance(vector, vector) RETURNS float8 + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION vector_negative_inner_product(vector, vector) RETURNS float8 + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION vector_spherical_distance(vector, vector) RETURNS float8 + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION vector_accum(double precision[], vector) RETURNS double precision[] + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION vector_avg(double precision[]) RETURNS vector + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION vector_combine(double precision[], double precision[]) RETURNS double precision[] + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +-- aggregates + +CREATE 
AGGREGATE avg(vector) ( + SFUNC = vector_accum, + STYPE = double precision[], + FINALFUNC = vector_avg, + COMBINEFUNC = vector_combine, + INITCOND = '{0}', + PARALLEL = SAFE +); + +-- cast functions + +CREATE FUNCTION vector(vector, integer, boolean) RETURNS vector + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION array_to_vector(integer[], integer, boolean) RETURNS vector + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION array_to_vector(real[], integer, boolean) RETURNS vector + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION array_to_vector(double precision[], integer, boolean) RETURNS vector + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION array_to_vector(numeric[], integer, boolean) RETURNS vector + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION vector_to_float4(vector, integer, boolean) RETURNS real[] + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +-- casts + +CREATE CAST (vector AS vector) + WITH FUNCTION vector(vector, integer, boolean) AS IMPLICIT; + +CREATE CAST (vector AS real[]) + WITH FUNCTION vector_to_float4(vector, integer, boolean) AS IMPLICIT; + +CREATE CAST (integer[] AS vector) + WITH FUNCTION array_to_vector(integer[], integer, boolean) AS ASSIGNMENT; + +CREATE CAST (real[] AS vector) + WITH FUNCTION array_to_vector(real[], integer, boolean) AS ASSIGNMENT; + +CREATE CAST (double precision[] AS vector) + WITH FUNCTION array_to_vector(double precision[], integer, boolean) AS ASSIGNMENT; + +CREATE CAST (numeric[] AS vector) + WITH FUNCTION array_to_vector(numeric[], integer, boolean) AS ASSIGNMENT; + +-- operators + +CREATE OPERATOR <-> ( + LEFTARG = vector, RIGHTARG = vector, PROCEDURE = l2_distance, + COMMUTATOR = '<->' +); + +CREATE OPERATOR <#> ( + LEFTARG = vector, RIGHTARG = vector, PROCEDURE = vector_negative_inner_product, + COMMUTATOR = '<#>' +); + +CREATE OPERATOR <=> ( + LEFTARG = vector, RIGHTARG = vector, PROCEDURE = cosine_distance, + COMMUTATOR = '<=>' +); + +CREATE OPERATOR + ( + LEFTARG = vector, RIGHTARG = vector, PROCEDURE = vector_add, + COMMUTATOR = + +); + +CREATE OPERATOR - ( + LEFTARG = vector, RIGHTARG = vector, PROCEDURE = vector_sub, + COMMUTATOR = - +); + +CREATE OPERATOR < ( + LEFTARG = vector, RIGHTARG = vector, PROCEDURE = vector_lt, + COMMUTATOR = > , NEGATOR = >= , + RESTRICT = scalarltsel, JOIN = scalarltjoinsel +); + +-- should use scalarlesel and scalarlejoinsel, but not supported in Postgres < 11 +CREATE OPERATOR <= ( + LEFTARG = vector, RIGHTARG = vector, PROCEDURE = vector_le, + COMMUTATOR = >= , NEGATOR = > , + RESTRICT = scalarltsel, JOIN = scalarltjoinsel +); + +CREATE OPERATOR = ( + LEFTARG = vector, RIGHTARG = vector, PROCEDURE = vector_eq, + COMMUTATOR = = , NEGATOR = <> , + RESTRICT = eqsel, JOIN = eqjoinsel +); + +CREATE OPERATOR <> ( + LEFTARG = vector, RIGHTARG = vector, PROCEDURE = vector_ne, + COMMUTATOR = <> , NEGATOR = = , + RESTRICT = eqsel, JOIN = eqjoinsel +); + +-- should use scalargesel and scalargejoinsel, but not supported in Postgres < 11 +CREATE OPERATOR >= ( + LEFTARG = vector, RIGHTARG = vector, PROCEDURE = vector_ge, + COMMUTATOR = <= , NEGATOR = < , + RESTRICT = scalargtsel, JOIN = scalargtjoinsel +); + +CREATE OPERATOR > ( + LEFTARG = vector, RIGHTARG = vector, PROCEDURE = vector_gt, + COMMUTATOR = < , NEGATOR = <= , + RESTRICT = scalargtsel, JOIN = scalargtjoinsel +); + +-- access method + +CREATE FUNCTION 
ivfflathandler(internal) RETURNS index_am_handler + AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE ACCESS METHOD ivfflat TYPE INDEX HANDLER ivfflathandler; + +COMMENT ON ACCESS METHOD ivfflat IS 'ivfflat index access method'; + +-- opclasses + +CREATE OPERATOR CLASS vector_ops + DEFAULT FOR TYPE vector USING btree AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + FUNCTION 1 vector_cmp(vector, vector); + +CREATE OPERATOR CLASS vector_l2_ops + DEFAULT FOR TYPE vector USING ivfflat AS + OPERATOR 1 <-> (vector, vector) FOR ORDER BY float_ops, + FUNCTION 1 vector_l2_squared_distance(vector, vector), + FUNCTION 3 l2_distance(vector, vector); + +CREATE OPERATOR CLASS vector_ip_ops + FOR TYPE vector USING ivfflat AS + OPERATOR 1 <#> (vector, vector) FOR ORDER BY float_ops, + FUNCTION 1 vector_negative_inner_product(vector, vector), + FUNCTION 3 vector_spherical_distance(vector, vector), + FUNCTION 4 vector_norm(vector); + +CREATE OPERATOR CLASS vector_cosine_ops + FOR TYPE vector USING ivfflat AS + OPERATOR 1 <=> (vector, vector) FOR ORDER BY float_ops, + FUNCTION 1 vector_negative_inner_product(vector, vector), + FUNCTION 2 vector_norm(vector), + FUNCTION 3 vector_spherical_distance(vector, vector), + FUNCTION 4 vector_norm(vector); diff --git a/contrib/pgvector/src/ivfbuild.c b/contrib/pgvector/src/ivfbuild.c new file mode 100644 index 0000000000000000000000000000000000000000..f74a821afa45f3d9648720dccfaf0e7e7f907a95 --- /dev/null +++ b/contrib/pgvector/src/ivfbuild.c @@ -0,0 +1,664 @@ +#include "postgres.h" + +#include + +#include "catalog/index.h" +#include "ivfflat.h" +#include "miscadmin.h" +#include "storage/bufmgr.h" +#include "utils/memutils.h" + +#if PG_VERSION_NUM >= 140000 +#include "utils/backend_progress.h" +#elif PG_VERSION_NUM >= 120000 +#include "pgstat.h" +#endif + +#if PG_VERSION_NUM >= 120000 +#include "access/tableam.h" +#include "commands/progress.h" +#else +#define PROGRESS_CREATEIDX_SUBPHASE 0 +#define PROGRESS_CREATEIDX_TUPLES_TOTAL 0 +#define PROGRESS_CREATEIDX_TUPLES_DONE 0 +#endif + +#include "catalog/pg_operator_d.h" +#include "catalog/pg_type_d.h" + +#if PG_VERSION_NUM >= 130000 +#define CALLBACK_ITEM_POINTER ItemPointer tid +#else +#define CALLBACK_ITEM_POINTER HeapTuple hup +#endif + +#if PG_VERSION_NUM >= 120000 +#define UpdateProgress(index, val) pgstat_progress_update_param(index, val) +#else +#define UpdateProgress(index, val) ((void)val) +#endif + +/* + * Add sample + */ +static void +AddSample(Datum *values, IvfflatBuildState * buildstate) +{ + VectorArray samples = buildstate->samples; + int targsamples = samples->maxlen; + + /* Detoast once for all calls */ + Datum value = PointerGetDatum(PG_DETOAST_DATUM(values[0])); + + /* + * Normalize with KMEANS_NORM_PROC since spherical distance function + * expects unit vectors + */ + if (buildstate->kmeansnormprocinfo != NULL) + { + if (!IvfflatNormValue(buildstate->kmeansnormprocinfo, buildstate->collation, &value, buildstate->normvec)) + return; + } + + if (samples->length < targsamples) + { + VectorArraySet(samples, samples->length, DatumGetVector(value)); + samples->length++; + } + else + { + if (buildstate->rowstoskip < 0) + buildstate->rowstoskip = reservoir_get_next_S(&buildstate->rstate, samples->length, targsamples); + + if (buildstate->rowstoskip <= 0) + { +#if PG_VERSION_NUM >= 150000 + int k = (int) (targsamples * sampler_random_fract(&buildstate->rstate.randstate)); +#else + int k = (int) (targsamples * sampler_random_fract(buildstate->rstate.randstate)); +#endif + + 
Assert(k >= 0 && k < targsamples); + VectorArraySet(samples, k, DatumGetVector(value)); + } + + buildstate->rowstoskip -= 1; + } +} + +/* + * Callback for sampling + */ +static void +SampleCallback(Relation index, CALLBACK_ITEM_POINTER, Datum *values, + bool *isnull, bool tupleIsAlive, void *state) +{ + IvfflatBuildState *buildstate = (IvfflatBuildState *) state; + MemoryContext oldCtx; + + /* Skip nulls */ + if (isnull[0]) + return; + + /* Use memory context since detoast can allocate */ + oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx); + + /* Add sample */ + AddSample(values, state); + + /* Reset memory context */ + MemoryContextSwitchTo(oldCtx); + MemoryContextReset(buildstate->tmpCtx); +} + +/* + * Sample rows with same logic as ANALYZE + */ +static void +SampleRows(IvfflatBuildState * buildstate) +{ + int targsamples = buildstate->samples->maxlen; + BlockNumber totalblocks = RelationGetNumberOfBlocks(buildstate->heap); + + buildstate->rowstoskip = -1; + + BlockSampler_Init(&buildstate->bs, totalblocks, targsamples, RandomInt()); + + reservoir_init_selection_state(&buildstate->rstate, targsamples); + while (BlockSampler_HasMore(&buildstate->bs)) + { + BlockNumber targblock = BlockSampler_Next(&buildstate->bs); + +#if PG_VERSION_NUM >= 120000 + table_index_build_range_scan(buildstate->heap, buildstate->index, buildstate->indexInfo, + false, true, false, targblock, 1, SampleCallback, (void *) buildstate, NULL); +#else + IndexBuildHeapRangeScan(buildstate->heap, buildstate->index, buildstate->indexInfo, + false, true, targblock, 1, SampleCallback, (void *) buildstate, NULL); +#endif + } +} + +/* + * Add tuple to sort + */ +static void +AddTupleToSort(Relation index, ItemPointer tid, Datum *values, IvfflatBuildState * buildstate) +{ + double distance; + double minDistance = DBL_MAX; + int closestCenter = 0; + VectorArray centers = buildstate->centers; + TupleTableSlot *slot = buildstate->slot; + int i; + + /* Detoast once for all calls */ + Datum value = PointerGetDatum(PG_DETOAST_DATUM(values[0])); + + /* Normalize if needed */ + if (buildstate->normprocinfo != NULL) + { + if (!IvfflatNormValue(buildstate->normprocinfo, buildstate->collation, &value, buildstate->normvec)) + return; + } + + /* Find the list that minimizes the distance */ + for (i = 0; i < centers->length; i++) + { + distance = DatumGetFloat8(FunctionCall2Coll(buildstate->procinfo, buildstate->collation, value, PointerGetDatum(VectorArrayGet(centers, i)))); + + if (distance < minDistance) + { + minDistance = distance; + closestCenter = i; + } + } + +#ifdef IVFFLAT_KMEANS_DEBUG + buildstate->inertia += minDistance; + buildstate->listSums[closestCenter] += minDistance; + buildstate->listCounts[closestCenter]++; +#endif + + /* Create a virtual tuple */ + ExecClearTuple(slot); + slot->tts_values[0] = Int32GetDatum(closestCenter); + slot->tts_isnull[0] = false; + slot->tts_values[1] = PointerGetDatum(tid); + slot->tts_isnull[1] = false; + slot->tts_values[2] = value; + slot->tts_isnull[2] = false; + ExecStoreVirtualTuple(slot); + + /* + * Add tuple to sort + * + * tuplesort_puttupleslot comment: Input data is always copied; the caller + * need not save it. 
+ */ + tuplesort_puttupleslot(buildstate->sortstate, slot); + + buildstate->indtuples++; +} + +/* + * Callback for table_index_build_scan + */ +static void +BuildCallback(Relation index, CALLBACK_ITEM_POINTER, Datum *values, + bool *isnull, bool tupleIsAlive, void *state) +{ + IvfflatBuildState *buildstate = (IvfflatBuildState *) state; + MemoryContext oldCtx; + +#if PG_VERSION_NUM < 130000 + ItemPointer tid = &hup->t_self; +#endif + + /* Skip nulls */ + if (isnull[0]) + return; + + /* Use memory context since detoast can allocate */ + oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx); + + /* Add tuple to sort */ + AddTupleToSort(index, tid, values, buildstate); + + /* Reset memory context */ + MemoryContextSwitchTo(oldCtx); + MemoryContextReset(buildstate->tmpCtx); +} + +/* + * Get index tuple from sort state + */ +static inline void +GetNextTuple(Tuplesortstate *sortstate, TupleDesc tupdesc, TupleTableSlot *slot, IndexTuple *itup, int *list) +{ + Datum value; + bool isnull; + + if (tuplesort_gettupleslot(sortstate, true, false, slot, NULL)) + { + *list = DatumGetInt32(slot_getattr(slot, 1, &isnull)); + value = slot_getattr(slot, 3, &isnull); + + /* Form the index tuple */ + *itup = index_form_tuple(tupdesc, &value, &isnull); + (*itup)->t_tid = *((ItemPointer) DatumGetPointer(slot_getattr(slot, 2, &isnull))); + } + else + *list = -1; +} + +/* + * Create initial entry pages + */ +static void +InsertTuples(Relation index, IvfflatBuildState * buildstate, ForkNumber forkNum) +{ + Buffer buf; + Page page; + GenericXLogState *state; + int list; + IndexTuple itup = NULL; /* silence compiler warning */ + BlockNumber startPage; + BlockNumber insertPage; + Size itemsz; + int i; + int64 inserted = 0; + +#if PG_VERSION_NUM >= 120000 + TupleTableSlot *slot = MakeSingleTupleTableSlot(buildstate->tupdesc, &TTSOpsMinimalTuple); +#else + TupleTableSlot *slot = MakeSingleTupleTableSlot(buildstate->tupdesc); +#endif + TupleDesc tupdesc = RelationGetDescr(index); + + UpdateProgress(PROGRESS_CREATEIDX_SUBPHASE, PROGRESS_IVFFLAT_PHASE_LOAD); + + UpdateProgress(PROGRESS_CREATEIDX_TUPLES_TOTAL, buildstate->indtuples); + + GetNextTuple(buildstate->sortstate, tupdesc, slot, &itup, &list); + + for (i = 0; i < buildstate->centers->length; i++) + { + /* Can take a while, so ensure we can interrupt */ + /* Needs to be called when no buffer locks are held */ + CHECK_FOR_INTERRUPTS(); + + buf = IvfflatNewBuffer(index, forkNum); + IvfflatInitRegisterPage(index, &buf, &page, &state); + + startPage = BufferGetBlockNumber(buf); + + /* Get all tuples for list */ + while (list == i) + { + /* Check for free space */ + itemsz = MAXALIGN(IndexTupleSize(itup)); + if (PageGetFreeSpace(page) < itemsz) + IvfflatAppendPage(index, &buf, &page, &state, forkNum); + + /* Add the item */ + if (PageAddItem(page, (Item) itup, itemsz, InvalidOffsetNumber, false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to add index item to \"%s\"", RelationGetRelationName(index)); + + pfree(itup); + + UpdateProgress(PROGRESS_CREATEIDX_TUPLES_DONE, ++inserted); + + GetNextTuple(buildstate->sortstate, tupdesc, slot, &itup, &list); + } + + insertPage = BufferGetBlockNumber(buf); + + IvfflatCommitBuffer(buf, state); + + /* Set the start and insert pages */ + IvfflatUpdateList(index, state, buildstate->listInfo[i], insertPage, InvalidBlockNumber, startPage, forkNum); + } +} + +/* + * Initialize the build state + */ +static void +InitBuildState(IvfflatBuildState * buildstate, Relation heap, Relation index, IndexInfo *indexInfo) +{ + buildstate->heap = 
heap; + buildstate->index = index; + buildstate->indexInfo = indexInfo; + + buildstate->lists = IvfflatGetLists(index); + buildstate->dimensions = TupleDescAttr(index->rd_att, 0)->atttypmod; + + /* Require column to have dimensions to be indexed */ + if (buildstate->dimensions < 0) + elog(ERROR, "column does not have dimensions"); + + if (buildstate->dimensions > IVFFLAT_MAX_DIM) + elog(ERROR, "column cannot have more than %d dimensions for ivfflat index", IVFFLAT_MAX_DIM); + + buildstate->reltuples = 0; + buildstate->indtuples = 0; + + /* Get support functions */ + buildstate->procinfo = index_getprocinfo(index, 1, IVFFLAT_DISTANCE_PROC); + buildstate->normprocinfo = IvfflatOptionalProcInfo(index, IVFFLAT_NORM_PROC); + buildstate->kmeansnormprocinfo = IvfflatOptionalProcInfo(index, IVFFLAT_KMEANS_NORM_PROC); + buildstate->collation = index->rd_indcollation[0]; + + /* Require more than one dimension for spherical k-means */ + /* Lists check for backwards compatibility */ + /* TODO Remove lists check in 0.3.0 */ + if (buildstate->kmeansnormprocinfo != NULL && buildstate->dimensions == 1 && buildstate->lists > 1) + elog(ERROR, "dimensions must be greater than one for this opclass"); + + /* Create tuple description for sorting */ +#if PG_VERSION_NUM >= 120000 + buildstate->tupdesc = CreateTemplateTupleDesc(3); +#else + buildstate->tupdesc = CreateTemplateTupleDesc(3, false); +#endif + TupleDescInitEntry(buildstate->tupdesc, (AttrNumber) 1, "list", INT4OID, -1, 0); + TupleDescInitEntry(buildstate->tupdesc, (AttrNumber) 2, "tid", TIDOID, -1, 0); + TupleDescInitEntry(buildstate->tupdesc, (AttrNumber) 3, "vector", RelationGetDescr(index)->attrs[0].atttypid, -1, 0); + +#if PG_VERSION_NUM >= 120000 + buildstate->slot = MakeSingleTupleTableSlot(buildstate->tupdesc, &TTSOpsVirtual); +#else + buildstate->slot = MakeSingleTupleTableSlot(buildstate->tupdesc); +#endif + + buildstate->centers = VectorArrayInit(buildstate->lists, buildstate->dimensions); + buildstate->listInfo = palloc(sizeof(ListInfo) * buildstate->lists); + + /* Reuse for each tuple */ + buildstate->normvec = InitVector(buildstate->dimensions); + + buildstate->tmpCtx = AllocSetContextCreate(CurrentMemoryContext, + "Ivfflat build temporary context", + ALLOCSET_DEFAULT_SIZES); + +#ifdef IVFFLAT_KMEANS_DEBUG + buildstate->inertia = 0; + buildstate->listSums = palloc0(sizeof(double) * buildstate->lists); + buildstate->listCounts = palloc0(sizeof(int) * buildstate->lists); +#endif +} + +/* + * Free resources + */ +static void +FreeBuildState(IvfflatBuildState * buildstate) +{ + VectorArrayFree(buildstate->centers); + pfree(buildstate->listInfo); + pfree(buildstate->normvec); + +#ifdef IVFFLAT_KMEANS_DEBUG + pfree(buildstate->listSums); + pfree(buildstate->listCounts); +#endif + + MemoryContextDelete(buildstate->tmpCtx); +} + +/* + * Compute centers + */ +static void +ComputeCenters(IvfflatBuildState * buildstate) +{ + int numSamples; + + UpdateProgress(PROGRESS_CREATEIDX_SUBPHASE, PROGRESS_IVFFLAT_PHASE_KMEANS); + + /* Target 50 samples per list, with at least 10000 samples */ + /* The number of samples has a large effect on index build time */ + numSamples = buildstate->lists * 50; + if (numSamples < 10000) + numSamples = 10000; + + /* Skip samples for unlogged table */ + if (buildstate->heap == NULL) + numSamples = 1; + + /* Sample rows */ + /* TODO Ensure within maintenance_work_mem */ + buildstate->samples = VectorArrayInit(numSamples, buildstate->dimensions); + if (buildstate->heap != NULL) + { + SampleRows(buildstate); + + if 
(buildstate->samples->length < buildstate->lists) + { + ereport(NOTICE, + (errmsg("ivfflat index created with little data"), + errdetail("This will cause low recall."), + errhint("Drop the index until the table has more data."))); + } + } + + /* Calculate centers */ + IvfflatBench("k-means", IvfflatKmeans(buildstate->index, buildstate->samples, buildstate->centers)); + + /* Free samples before we allocate more memory */ + VectorArrayFree(buildstate->samples); +} + +/* + * Create the metapage + */ +static void +CreateMetaPage(Relation index, int dimensions, int lists, ForkNumber forkNum) +{ + Buffer buf; + Page page; + GenericXLogState *state; + IvfflatMetaPage metap; + + buf = IvfflatNewBuffer(index, forkNum); + IvfflatInitRegisterPage(index, &buf, &page, &state); + + /* Set metapage data */ + metap = IvfflatPageGetMeta(page); + metap->magicNumber = IVFFLAT_MAGIC_NUMBER; + metap->version = IVFFLAT_VERSION; + metap->dimensions = dimensions; + metap->lists = lists; + ((PageHeader) page)->pd_lower = + ((char *) metap + sizeof(IvfflatMetaPageData)) - (char *) page; + + IvfflatCommitBuffer(buf, state); +} + +/* + * Create list pages + */ +static void +CreateListPages(Relation index, VectorArray centers, int dimensions, + int lists, ForkNumber forkNum, ListInfo * *listInfo) +{ + int i; + Buffer buf; + Page page; + GenericXLogState *state; + OffsetNumber offno; + Size itemsz; + IvfflatList list; + + itemsz = MAXALIGN(IVFFLAT_LIST_SIZE(dimensions)); + list = palloc(itemsz); + + buf = IvfflatNewBuffer(index, forkNum); + IvfflatInitRegisterPage(index, &buf, &page, &state); + + for (i = 0; i < lists; i++) + { + /* Load list */ + list->startPage = InvalidBlockNumber; + list->insertPage = InvalidBlockNumber; + memcpy(&list->center, VectorArrayGet(centers, i), VECTOR_SIZE(dimensions)); + + /* Ensure free space */ + if (PageGetFreeSpace(page) < itemsz) + IvfflatAppendPage(index, &buf, &page, &state, forkNum); + + /* Add the item */ + offno = PageAddItem(page, (Item) list, itemsz, InvalidOffsetNumber, false, false); + if (offno == InvalidOffsetNumber) + elog(ERROR, "failed to add index item to \"%s\"", RelationGetRelationName(index)); + + /* Save location info */ + (*listInfo)[i].blkno = BufferGetBlockNumber(buf); + (*listInfo)[i].offno = offno; + } + + IvfflatCommitBuffer(buf, state); + + pfree(list); +} + +/* + * Print k-means metrics + */ +#ifdef IVFFLAT_KMEANS_DEBUG +static void +PrintKmeansMetrics(IvfflatBuildState * buildstate) +{ + elog(INFO, "inertia: %.3e", buildstate->inertia); + + /* Calculate Davies-Bouldin index */ + if (buildstate->lists > 1) + { + double db = 0.0; + + /* Calculate average distance */ + for (int i = 0; i < buildstate->lists; i++) + { + if (buildstate->listCounts[i] > 0) + buildstate->listSums[i] /= buildstate->listCounts[i]; + } + + for (int i = 0; i < buildstate->lists; i++) + { + double max = 0.0; + double distance; + + for (int j = 0; j < buildstate->lists; j++) + { + if (j == i) + continue; + + distance = DatumGetFloat8(FunctionCall2Coll(buildstate->procinfo, buildstate->collation, PointerGetDatum(VectorArrayGet(buildstate->centers, i)), PointerGetDatum(VectorArrayGet(buildstate->centers, j)))); + distance = (buildstate->listSums[i] + buildstate->listSums[j]) / distance; + + if (distance > max) + max = distance; + } + db += max; + } + db /= buildstate->lists; + elog(INFO, "davies-bouldin: %.3f", db); + } +} +#endif + +/* + * Scan table for tuples to index + */ +static void +ScanTable(IvfflatBuildState * buildstate) +{ +#if PG_VERSION_NUM >= 120000 + buildstate->reltuples 
= table_index_build_scan(buildstate->heap, buildstate->index, buildstate->indexInfo, + true, true, BuildCallback, (void *) buildstate, NULL); +#else + buildstate->reltuples = IndexBuildHeapScan(buildstate->heap, buildstate->index, buildstate->indexInfo, + true, BuildCallback, (void *) buildstate, NULL); +#endif +} + +/* + * Create entry pages + */ +static void +CreateEntryPages(IvfflatBuildState * buildstate, ForkNumber forkNum) +{ + AttrNumber attNums[] = {1}; + Oid sortOperators[] = {Int4LessOperator}; + Oid sortCollations[] = {InvalidOid}; + bool nullsFirstFlags[] = {false}; + + UpdateProgress(PROGRESS_CREATEIDX_SUBPHASE, PROGRESS_IVFFLAT_PHASE_SORT); + + buildstate->sortstate = tuplesort_begin_heap(buildstate->tupdesc, 1, attNums, sortOperators, sortCollations, nullsFirstFlags, maintenance_work_mem, NULL, false); + + /* Add tuples to sort */ + if (buildstate->heap != NULL) + IvfflatBench("assign tuples", ScanTable(buildstate)); + + /* Sort */ + IvfflatBench("sort tuples", tuplesort_performsort(buildstate->sortstate)); + +#ifdef IVFFLAT_KMEANS_DEBUG + PrintKmeansMetrics(buildstate); +#endif + + /* Insert */ + IvfflatBench("load tuples", InsertTuples(buildstate->index, buildstate, forkNum)); + tuplesort_end(buildstate->sortstate); +} + +/* + * Build the index + */ +static void +BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, + IvfflatBuildState * buildstate, ForkNumber forkNum) +{ + InitBuildState(buildstate, heap, index, indexInfo); + + ComputeCenters(buildstate); + + /* Create pages */ + CreateMetaPage(index, buildstate->dimensions, buildstate->lists, forkNum); + CreateListPages(index, buildstate->centers, buildstate->dimensions, buildstate->lists, forkNum, &buildstate->listInfo); + CreateEntryPages(buildstate, forkNum); + + FreeBuildState(buildstate); +} + +/* + * Build the index for a logged table + */ +IndexBuildResult * +ivfflatbuild(Relation heap, Relation index, IndexInfo *indexInfo) +{ + IndexBuildResult *result; + IvfflatBuildState buildstate; + + BuildIndex(heap, index, indexInfo, &buildstate, MAIN_FORKNUM); + + result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); + result->heap_tuples = buildstate.reltuples; + result->index_tuples = buildstate.indtuples; + + return result; +} + +/* + * Build the index for an unlogged table + */ +void +ivfflatbuildempty(Relation index) +{ + IndexInfo *indexInfo = BuildIndexInfo(index); + IvfflatBuildState buildstate; + + BuildIndex(NULL, index, indexInfo, &buildstate, INIT_FORKNUM); +} diff --git a/contrib/pgvector/src/ivfflat.c b/contrib/pgvector/src/ivfflat.c new file mode 100644 index 0000000000000000000000000000000000000000..ef926a872d8dbeb11a3c08157729b1fada3b2755 --- /dev/null +++ b/contrib/pgvector/src/ivfflat.c @@ -0,0 +1,251 @@ +#include "postgres.h" + +#include + +#include "access/amapi.h" +#include "commands/vacuum.h" +#include "ivfflat.h" +#include "utils/guc.h" +#include "utils/selfuncs.h" +#include "utils/spccache.h" + +#if PG_VERSION_NUM >= 120000 +#include "commands/progress.h" +#endif + +int ivfflat_probes; +static relopt_kind ivfflat_relopt_kind; + +/* + * Initialize index options and variables + */ +void +_PG_init(void) +{ + ivfflat_relopt_kind = add_reloption_kind(); + add_int_reloption(ivfflat_relopt_kind, "lists", "Number of inverted lists", + IVFFLAT_DEFAULT_LISTS, 1, IVFFLAT_MAX_LISTS +#if PG_VERSION_NUM >= 130000 + ,AccessExclusiveLock +#endif + ); + + DefineCustomIntVariable("ivfflat.probes", "Sets the number of probes", + "Valid range is 1..lists.", &ivfflat_probes, + 1, 1, 
IVFFLAT_MAX_LISTS, PGC_USERSET, 0, NULL, NULL, NULL); +} + +/* + * Get the name of index build phase + */ +#if PG_VERSION_NUM >= 120000 +static char * +ivfflatbuildphasename(int64 phasenum) +{ + switch (phasenum) + { + case PROGRESS_CREATEIDX_SUBPHASE_INITIALIZE: + return "initializing"; + case PROGRESS_IVFFLAT_PHASE_KMEANS: + return "performing k-means"; + case PROGRESS_IVFFLAT_PHASE_SORT: + return "sorting tuples"; + case PROGRESS_IVFFLAT_PHASE_LOAD: + return "loading tuples"; + default: + return NULL; + } +} +#endif + +/* + * Estimate the cost of an index scan + */ +static void +ivfflatcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, + Cost *indexStartupCost, Cost *indexTotalCost, + Selectivity *indexSelectivity, double *indexCorrelation, + double *indexPages) +{ + GenericCosts costs; + int lists; + double ratio; + double spc_seq_page_cost; + Relation indexRel; +#if PG_VERSION_NUM < 120000 + List *qinfos; +#endif + + /* Never use index without order */ + if (path->indexorderbys == NULL) + { + *indexStartupCost = DBL_MAX; + *indexTotalCost = DBL_MAX; + *indexSelectivity = 0; + *indexCorrelation = 0; + *indexPages = 0; + return; + } + + MemSet(&costs, 0, sizeof(costs)); + + indexRel = index_open(path->indexinfo->indexoid, NoLock); + lists = IvfflatGetLists(indexRel); + index_close(indexRel, NoLock); + + /* Get the ratio of lists that we need to visit */ + ratio = ((double) ivfflat_probes) / lists; + if (ratio > 1.0) + ratio = 1.0; + + /* + * This gives us the subset of tuples to visit. This value is passed into + * the generic cost estimator to determine the number of pages to visit + * during the index scan. + */ + costs.numIndexTuples = path->indexinfo->tuples * ratio; + +#if PG_VERSION_NUM >= 120000 + genericcostestimate(root, path, loop_count, &costs); +#else + qinfos = deconstruct_indexquals(path); + genericcostestimate(root, path, loop_count, qinfos, &costs); +#endif + + get_tablespace_page_costs(path->indexinfo->reltablespace, NULL, &spc_seq_page_cost); + + /* Adjust cost if needed since TOAST not included in seq scan cost */ + if (costs.numIndexPages > path->indexinfo->rel->pages && ratio < 0.5) + { + /* Change all page cost from random to sequential */ + costs.indexTotalCost -= costs.numIndexPages * (costs.spc_random_page_cost - spc_seq_page_cost); + + /* Remove cost of extra pages */ + costs.indexTotalCost -= (costs.numIndexPages - path->indexinfo->rel->pages) * spc_seq_page_cost; + } + else + { + /* Change some page cost from random to sequential */ + costs.indexTotalCost -= 0.5 * costs.numIndexPages * (costs.spc_random_page_cost - spc_seq_page_cost); + } + + /* + * If the list selectivity is lower than what is returned from the generic + * cost estimator, use that. 
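+	 * Only ivfflat_probes of the lists are scanned, so the scan visits roughly that fraction of the index tuples.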
+ */ + if (ratio < costs.indexSelectivity) + costs.indexSelectivity = ratio; + + /* Use total cost since most work happens before first tuple is returned */ + *indexStartupCost = costs.indexTotalCost; + *indexTotalCost = costs.indexTotalCost; + *indexSelectivity = costs.indexSelectivity; + *indexCorrelation = costs.indexCorrelation; + *indexPages = costs.numIndexPages; +} + +/* + * Parse and validate the reloptions + */ +static bytea * +ivfflatoptions(Datum reloptions, bool validate) +{ + static const relopt_parse_elt tab[] = { + {"lists", RELOPT_TYPE_INT, offsetof(IvfflatOptions, lists)}, + }; + +#if PG_VERSION_NUM >= 130000 + return (bytea *) build_reloptions(reloptions, validate, + ivfflat_relopt_kind, + sizeof(IvfflatOptions), + tab, lengthof(tab)); +#else + relopt_value *options; + int numoptions; + IvfflatOptions *rdopts; + + options = parseRelOptions(reloptions, validate, ivfflat_relopt_kind, &numoptions); + rdopts = allocateReloptStruct(sizeof(IvfflatOptions), options, numoptions); + fillRelOptions((void *) rdopts, sizeof(IvfflatOptions), options, numoptions, + validate, tab, lengthof(tab)); + + return (bytea *) rdopts; +#endif +} + +/* + * Validate catalog entries for the specified operator class + */ +static bool +ivfflatvalidate(Oid opclassoid) +{ + return true; +} + +/* + * Define index handler + * + * See https://www.postgresql.org/docs/current/index-api.html + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(ivfflathandler); +Datum +ivfflathandler(PG_FUNCTION_ARGS) +{ + IndexAmRoutine *amroutine = makeNode(IndexAmRoutine); + + amroutine->amstrategies = 0; + amroutine->amsupport = 4; +#if PG_VERSION_NUM >= 130000 + amroutine->amoptsprocnum = 0; +#endif + amroutine->amcanorder = false; + amroutine->amcanorderbyop = true; + amroutine->amcanbackward = false; /* can change direction mid-scan */ + amroutine->amcanunique = false; + amroutine->amcanmulticol = false; + amroutine->amoptionalkey = true; + amroutine->amsearcharray = false; + amroutine->amsearchnulls = false; + amroutine->amstorage = false; + amroutine->amclusterable = false; + amroutine->ampredlocks = false; + amroutine->amcanparallel = false; + amroutine->amcaninclude = false; +#if PG_VERSION_NUM >= 130000 + amroutine->amusemaintenanceworkmem = false; /* not used during VACUUM */ + amroutine->amparallelvacuumoptions = VACUUM_OPTION_PARALLEL_BULKDEL; +#endif + amroutine->amkeytype = InvalidOid; + + /* Interface functions */ + amroutine->ambuild = ivfflatbuild; + amroutine->ambuildempty = ivfflatbuildempty; + amroutine->aminsert = ivfflatinsert; + amroutine->ambulkdelete = ivfflatbulkdelete; + amroutine->amvacuumcleanup = ivfflatvacuumcleanup; + amroutine->amcanreturn = NULL; /* tuple not included in heapsort */ + amroutine->amcostestimate = ivfflatcostestimate; + amroutine->amoptions = ivfflatoptions; + amroutine->amproperty = NULL; /* TODO AMPROP_DISTANCE_ORDERABLE */ +#if PG_VERSION_NUM >= 120000 + amroutine->ambuildphasename = ivfflatbuildphasename; +#endif + amroutine->amvalidate = ivfflatvalidate; +#if PG_VERSION_NUM >= 140000 + amroutine->amadjustmembers = NULL; +#endif + amroutine->ambeginscan = ivfflatbeginscan; + amroutine->amrescan = ivfflatrescan; + amroutine->amgettuple = ivfflatgettuple; + amroutine->amgetbitmap = NULL; + amroutine->amendscan = ivfflatendscan; + amroutine->ammarkpos = NULL; + amroutine->amrestrpos = NULL; + + /* Interface functions to support parallel index scans */ + amroutine->amestimateparallelscan = NULL; + amroutine->aminitparallelscan = NULL; + amroutine->amparallelrescan = NULL; + + 
PG_RETURN_POINTER(amroutine); +} diff --git a/contrib/pgvector/src/ivfflat.h b/contrib/pgvector/src/ivfflat.h new file mode 100644 index 0000000000000000000000000000000000000000..5bd7622caa1d1a28e7fe9c330c07e3884d8155dd --- /dev/null +++ b/contrib/pgvector/src/ivfflat.h @@ -0,0 +1,250 @@ +#ifndef IVFFLAT_H +#define IVFFLAT_H + +#include "postgres.h" + +#if PG_VERSION_NUM < 110000 +#error "Requires PostgreSQL 11+" +#endif + +#include "access/generic_xlog.h" +#include "access/reloptions.h" +#include "nodes/execnodes.h" +#include "port.h" /* for strtof() and random() */ +#include "utils/sampling.h" +#include "utils/tuplesort.h" +#include "vector.h" + +#if PG_VERSION_NUM >= 150000 +#include "common/pg_prng.h" +#endif + +#ifdef IVFFLAT_BENCH +#include "portability/instr_time.h" +#endif + +#define IVFFLAT_MAX_DIM 2000 + +/* Support functions */ +#define IVFFLAT_DISTANCE_PROC 1 +#define IVFFLAT_NORM_PROC 2 +#define IVFFLAT_KMEANS_DISTANCE_PROC 3 +#define IVFFLAT_KMEANS_NORM_PROC 4 + +#define IVFFLAT_VERSION 1 +#define IVFFLAT_MAGIC_NUMBER 0x14FF1A7 +#define IVFFLAT_PAGE_ID 0xFF84 + +/* Preserved page numbers */ +#define IVFFLAT_METAPAGE_BLKNO 0 +#define IVFFLAT_HEAD_BLKNO 1 /* first list page */ + +#define IVFFLAT_DEFAULT_LISTS 100 +#define IVFFLAT_MAX_LISTS 32768 + +/* Build phases */ +/* PROGRESS_CREATEIDX_SUBPHASE_INITIALIZE is 1 */ +#define PROGRESS_IVFFLAT_PHASE_KMEANS 2 +#define PROGRESS_IVFFLAT_PHASE_SORT 3 +#define PROGRESS_IVFFLAT_PHASE_LOAD 4 + +#define IVFFLAT_LIST_SIZE(_dim) (offsetof(IvfflatListData, center) + VECTOR_SIZE(_dim)) + +#define IvfflatPageGetOpaque(page) ((IvfflatPageOpaque) PageGetSpecialPointer(page)) +#define IvfflatPageGetMeta(page) ((IvfflatMetaPageData *) PageGetContents(page)) + +#ifdef IVFFLAT_BENCH +#define IvfflatBench(name, code) \ + do { \ + instr_time start; \ + instr_time duration; \ + INSTR_TIME_SET_CURRENT(start); \ + (code); \ + INSTR_TIME_SET_CURRENT(duration); \ + INSTR_TIME_SUBTRACT(duration, start); \ + elog(INFO, "%s: %.3f ms", name, INSTR_TIME_GET_MILLISEC(duration)); \ + } while (0) +#else +#define IvfflatBench(name, code) (code) +#endif + +#if PG_VERSION_NUM >= 150000 +#define RandomDouble() pg_prng_double(&pg_global_prng_state) +#define RandomInt() pg_prng_uint32(&pg_global_prng_state) +#else +#define RandomDouble() (((double) random()) / MAX_RANDOM_VALUE) +#define RandomInt() random() +#endif + +/* Variables */ +extern int ivfflat_probes; + +/* Exported functions */ +PGDLLEXPORT void _PG_init(void); + +typedef struct VectorArrayData +{ + int length; + int maxlen; + int dim; + Vector *items; +} VectorArrayData; + +typedef VectorArrayData * VectorArray; + +typedef struct ListInfo +{ + BlockNumber blkno; + OffsetNumber offno; +} ListInfo; + +/* IVFFlat index options */ +typedef struct IvfflatOptions +{ + int32 vl_len_; /* varlena header (do not touch directly!) 
*/ + int lists; /* number of lists */ +} IvfflatOptions; + +typedef struct IvfflatBuildState +{ + /* Info */ + Relation heap; + Relation index; + IndexInfo *indexInfo; + + /* Settings */ + int dimensions; + int lists; + + /* Statistics */ + double indtuples; + double reltuples; + + /* Support functions */ + FmgrInfo *procinfo; + FmgrInfo *normprocinfo; + FmgrInfo *kmeansnormprocinfo; + Oid collation; + + /* Variables */ + VectorArray samples; + VectorArray centers; + ListInfo *listInfo; + Vector *normvec; + +#ifdef IVFFLAT_KMEANS_DEBUG + double inertia; + double *listSums; + int *listCounts; +#endif + + /* Sampling */ + BlockSamplerData bs; + ReservoirStateData rstate; + int rowstoskip; + + /* Sorting */ + Tuplesortstate *sortstate; + TupleDesc tupdesc; + TupleTableSlot *slot; + + /* Memory */ + MemoryContext tmpCtx; +} IvfflatBuildState; + +typedef struct IvfflatMetaPageData +{ + uint32 magicNumber; + uint32 version; + uint16 dimensions; + uint16 lists; +} IvfflatMetaPageData; + +typedef IvfflatMetaPageData * IvfflatMetaPage; + +typedef struct IvfflatPageOpaqueData +{ + BlockNumber nextblkno; + uint16 unused; + uint16 page_id; /* for identification of IVFFlat indexes */ +} IvfflatPageOpaqueData; + +typedef IvfflatPageOpaqueData * IvfflatPageOpaque; + +typedef struct IvfflatListData +{ + BlockNumber startPage; + BlockNumber insertPage; + Vector center; +} IvfflatListData; + +typedef IvfflatListData * IvfflatList; + +typedef struct IvfflatScanList +{ + pairingheap_node ph_node; + BlockNumber startPage; + double distance; +} IvfflatScanList; + +typedef struct IvfflatScanOpaqueData +{ + int probes; + bool first; + Buffer buf; + + /* Sorting */ + Tuplesortstate *sortstate; + TupleDesc tupdesc; + TupleTableSlot *slot; + bool isnull; + + /* Support functions */ + FmgrInfo *procinfo; + FmgrInfo *normprocinfo; + Oid collation; + + /* Lists */ + pairingheap *listQueue; + IvfflatScanList lists[FLEXIBLE_ARRAY_MEMBER]; /* must come last */ +} IvfflatScanOpaqueData; + +typedef IvfflatScanOpaqueData * IvfflatScanOpaque; + +#define VECTOR_ARRAY_SIZE(_length, _dim) (sizeof(VectorArrayData) + (_length) * VECTOR_SIZE(_dim)) +#define VECTOR_ARRAY_OFFSET(_arr, _offset) ((char*) (_arr)->items + (_offset) * VECTOR_SIZE((_arr)->dim)) +#define VectorArrayGet(_arr, _offset) ((Vector *) VECTOR_ARRAY_OFFSET(_arr, _offset)) +#define VectorArraySet(_arr, _offset, _val) memcpy(VECTOR_ARRAY_OFFSET(_arr, _offset), _val, VECTOR_SIZE((_arr)->dim)) + +/* Methods */ +VectorArray VectorArrayInit(int maxlen, int dimensions); +void VectorArrayFree(VectorArray arr); +void PrintVectorArray(char *msg, VectorArray arr); +void IvfflatKmeans(Relation index, VectorArray samples, VectorArray centers); +FmgrInfo *IvfflatOptionalProcInfo(Relation rel, uint16 procnum); +bool IvfflatNormValue(FmgrInfo *procinfo, Oid collation, Datum *value, Vector * result); +int IvfflatGetLists(Relation index); +void IvfflatUpdateList(Relation index, GenericXLogState *state, ListInfo listInfo, BlockNumber insertPage, BlockNumber originalInsertPage, BlockNumber startPage, ForkNumber forkNum); +void IvfflatCommitBuffer(Buffer buf, GenericXLogState *state); +void IvfflatAppendPage(Relation index, Buffer *buf, Page *page, GenericXLogState **state, ForkNumber forkNum); +Buffer IvfflatNewBuffer(Relation index, ForkNumber forkNum); +void IvfflatInitPage(Buffer buf, Page page); +void IvfflatInitRegisterPage(Relation index, Buffer *buf, Page *page, GenericXLogState **state); + +/* Index access methods */ +IndexBuildResult *ivfflatbuild(Relation heap, Relation 
index, IndexInfo *indexInfo); +void ivfflatbuildempty(Relation index); +bool ivfflatinsert(Relation index, Datum *values, bool *isnull, ItemPointer heap_tid, Relation heap, IndexUniqueCheck checkUnique +#if PG_VERSION_NUM >= 140000 + ,bool indexUnchanged +#endif + ,IndexInfo *indexInfo +); +IndexBulkDeleteResult *ivfflatbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state); +IndexBulkDeleteResult *ivfflatvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats); +IndexScanDesc ivfflatbeginscan(Relation index, int nkeys, int norderbys); +void ivfflatrescan(IndexScanDesc scan, ScanKey keys, int nkeys, ScanKey orderbys, int norderbys); +bool ivfflatgettuple(IndexScanDesc scan, ScanDirection dir); +void ivfflatendscan(IndexScanDesc scan); + +#endif diff --git a/contrib/pgvector/src/ivfinsert.c b/contrib/pgvector/src/ivfinsert.c new file mode 100644 index 0000000000000000000000000000000000000000..8761f6a564808e9b91e11c3eb69e5fdb46da193b --- /dev/null +++ b/contrib/pgvector/src/ivfinsert.c @@ -0,0 +1,215 @@ +#include "postgres.h" + +#include + +#include "ivfflat.h" +#include "storage/bufmgr.h" +#include "utils/memutils.h" + +/* + * Find the list that minimizes the distance function + */ +static void +FindInsertPage(Relation rel, Datum *values, BlockNumber *insertPage, ListInfo * listInfo) +{ + Buffer cbuf; + Page cpage; + IvfflatList list; + double distance; + double minDistance = DBL_MAX; + BlockNumber nextblkno = IVFFLAT_HEAD_BLKNO; + FmgrInfo *procinfo; + Oid collation; + OffsetNumber offno; + OffsetNumber maxoffno; + + /* Avoid compiler warning */ + listInfo->blkno = nextblkno; + listInfo->offno = FirstOffsetNumber; + + procinfo = index_getprocinfo(rel, 1, IVFFLAT_DISTANCE_PROC); + collation = rel->rd_indcollation[0]; + + /* Search all list pages */ + while (BlockNumberIsValid(nextblkno)) + { + cbuf = ReadBuffer(rel, nextblkno); + LockBuffer(cbuf, BUFFER_LOCK_SHARE); + cpage = BufferGetPage(cbuf); + maxoffno = PageGetMaxOffsetNumber(cpage); + + for (offno = FirstOffsetNumber; offno <= maxoffno; offno = OffsetNumberNext(offno)) + { + list = (IvfflatList) PageGetItem(cpage, PageGetItemId(cpage, offno)); + distance = DatumGetFloat8(FunctionCall2Coll(procinfo, collation, values[0], PointerGetDatum(&list->center))); + + if (distance < minDistance || !BlockNumberIsValid(*insertPage)) + { + *insertPage = list->insertPage; + listInfo->blkno = nextblkno; + listInfo->offno = offno; + minDistance = distance; + } + } + + nextblkno = IvfflatPageGetOpaque(cpage)->nextblkno; + + UnlockReleaseBuffer(cbuf); + } +} + +/* + * Insert a tuple into the index + */ +static void +InsertTuple(Relation rel, Datum *values, bool *isnull, ItemPointer heap_tid, Relation heapRel) +{ + IndexTuple itup; + Datum value; + FmgrInfo *normprocinfo; + Buffer buf; + Page page; + GenericXLogState *state; + Size itemsz; + BlockNumber insertPage = InvalidBlockNumber; + ListInfo listInfo; + BlockNumber originalInsertPage; + + /* Detoast once for all calls */ + value = PointerGetDatum(PG_DETOAST_DATUM(values[0])); + + /* Normalize if needed */ + normprocinfo = IvfflatOptionalProcInfo(rel, IVFFLAT_NORM_PROC); + if (normprocinfo != NULL) + { + if (!IvfflatNormValue(normprocinfo, rel->rd_indcollation[0], &value, NULL)) + return; + } + + /* Find the insert page - sets the page and list info */ + FindInsertPage(rel, values, &insertPage, &listInfo); + Assert(BlockNumberIsValid(insertPage)); + originalInsertPage = insertPage; + + /* Form tuple */ + itup = 
index_form_tuple(RelationGetDescr(rel), &value, isnull); + itup->t_tid = *heap_tid; + + /* Get tuple size */ + itemsz = MAXALIGN(IndexTupleSize(itup)); + Assert(itemsz <= BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - MAXALIGN(sizeof(IvfflatPageOpaqueData))); + + /* Find a page to insert the item */ + for (;;) + { + buf = ReadBuffer(rel, insertPage); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + state = GenericXLogStart(rel); + page = GenericXLogRegisterBuffer(state, buf, 0); + + if (PageGetFreeSpace(page) >= itemsz) + break; + + insertPage = IvfflatPageGetOpaque(page)->nextblkno; + + if (BlockNumberIsValid(insertPage)) + { + /* Move to next page */ + GenericXLogAbort(state); + UnlockReleaseBuffer(buf); + } + else + { + Buffer metabuf; + Buffer newbuf; + Page newpage; + + /* + * From ReadBufferExtended: Caller is responsible for ensuring + * that only one backend tries to extend a relation at the same + * time! + */ + metabuf = ReadBuffer(rel, IVFFLAT_METAPAGE_BLKNO); + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + + /* Add a new page */ + newbuf = IvfflatNewBuffer(rel, MAIN_FORKNUM); + newpage = GenericXLogRegisterBuffer(state, newbuf, GENERIC_XLOG_FULL_IMAGE); + + /* Init new page */ + IvfflatInitPage(newbuf, newpage); + + /* Update insert page */ + insertPage = BufferGetBlockNumber(newbuf); + + /* Update previous buffer */ + IvfflatPageGetOpaque(page)->nextblkno = insertPage; + + /* Commit */ + MarkBufferDirty(newbuf); + MarkBufferDirty(buf); + GenericXLogFinish(state); + + /* Unlock extend relation lock as early as possible */ + UnlockReleaseBuffer(metabuf); + + /* Unlock previous buffer */ + UnlockReleaseBuffer(buf); + + /* Prepare new buffer */ + state = GenericXLogStart(rel); + buf = newbuf; + page = GenericXLogRegisterBuffer(state, buf, 0); + break; + } + } + + /* Add to next offset */ + if (PageAddItem(page, (Item) itup, itemsz, InvalidOffsetNumber, false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to add index item to \"%s\"", RelationGetRelationName(rel)); + + IvfflatCommitBuffer(buf, state); + + /* Update the insert page */ + if (insertPage != originalInsertPage) + IvfflatUpdateList(rel, state, listInfo, insertPage, originalInsertPage, InvalidBlockNumber, MAIN_FORKNUM); +} + +/* + * Insert a tuple into the index + */ +bool +ivfflatinsert(Relation index, Datum *values, bool *isnull, ItemPointer heap_tid, + Relation heap, IndexUniqueCheck checkUnique +#if PG_VERSION_NUM >= 140000 + ,bool indexUnchanged +#endif + ,IndexInfo *indexInfo +) +{ + MemoryContext oldCtx; + MemoryContext insertCtx; + + /* Skip nulls */ + if (isnull[0]) + return false; + + /* + * Use memory context since detoast, IvfflatNormValue, and + * index_form_tuple can allocate + */ + insertCtx = AllocSetContextCreate(CurrentMemoryContext, + "Ivfflat insert temporary context", + ALLOCSET_DEFAULT_SIZES); + oldCtx = MemoryContextSwitchTo(insertCtx); + + /* Insert tuple */ + InsertTuple(index, values, isnull, heap_tid, heap); + + /* Delete memory context */ + MemoryContextSwitchTo(oldCtx); + MemoryContextDelete(insertCtx); + + return false; +} diff --git a/contrib/pgvector/src/ivfkmeans.c b/contrib/pgvector/src/ivfkmeans.c new file mode 100644 index 0000000000000000000000000000000000000000..eb94de0e3cfbb8d7636ddca9a4e7dbc906907699 --- /dev/null +++ b/contrib/pgvector/src/ivfkmeans.c @@ -0,0 +1,534 @@ +#include "postgres.h" + +#include +#include + +#include "ivfflat.h" +#include "miscadmin.h" + +/* + * Initialize with kmeans++ + * + * https://theory.stanford.edu/~sergei/papers/kMeansPP-soda.pdf + */ +static void 
+InitCenters(Relation index, VectorArray samples, VectorArray centers, float *lowerBound) +{ + FmgrInfo *procinfo; + Oid collation; + int i; + int64 j; + double distance; + double sum; + double choice; + Vector *vec; + float *weight = palloc(samples->length * sizeof(float)); + int numCenters = centers->maxlen; + int numSamples = samples->length; + + procinfo = index_getprocinfo(index, 1, IVFFLAT_KMEANS_DISTANCE_PROC); + collation = index->rd_indcollation[0]; + + /* Choose an initial center uniformly at random */ + VectorArraySet(centers, 0, VectorArrayGet(samples, RandomInt() % samples->length)); + centers->length++; + + for (j = 0; j < numSamples; j++) + weight[j] = DBL_MAX; + + for (i = 0; i < numCenters; i++) + { + CHECK_FOR_INTERRUPTS(); + + sum = 0.0; + + for (j = 0; j < numSamples; j++) + { + vec = VectorArrayGet(samples, j); + + /* Only need to compute distance for new center */ + /* TODO Use triangle inequality to reduce distance calculations */ + distance = DatumGetFloat8(FunctionCall2Coll(procinfo, collation, PointerGetDatum(vec), PointerGetDatum(VectorArrayGet(centers, i)))); + + /* Set lower bound */ + lowerBound[j * numCenters + i] = distance; + + /* Use distance squared for weighted probability distribution */ + distance *= distance; + + if (distance < weight[j]) + weight[j] = distance; + + sum += weight[j]; + } + + /* Only compute lower bound on last iteration */ + if (i + 1 == numCenters) + break; + + /* Choose new center using weighted probability distribution. */ + choice = sum * RandomDouble(); + for (j = 0; j < numSamples - 1; j++) + { + choice -= weight[j]; + if (choice <= 0) + break; + } + + VectorArraySet(centers, i + 1, VectorArrayGet(samples, j)); + centers->length++; + } + + pfree(weight); +} + +/* + * Apply norm to vector + */ +static inline void +ApplyNorm(FmgrInfo *normprocinfo, Oid collation, Vector * vec) +{ + int i; + double norm = DatumGetFloat8(FunctionCall1Coll(normprocinfo, collation, PointerGetDatum(vec))); + + /* TODO Handle zero norm */ + if (norm > 0) + { + for (i = 0; i < vec->dim; i++) + vec->x[i] /= norm; + } +} + +/* + * Compare vectors + */ +static int +CompareVectors(const void *a, const void *b) +{ + return vector_cmp_internal((Vector *) a, (Vector *) b); +} + +/* + * Quick approach if we have little data + */ +static void +QuickCenters(Relation index, VectorArray samples, VectorArray centers) +{ + int i; + int j; + Vector *vec; + int dimensions = centers->dim; + Oid collation = index->rd_indcollation[0]; + FmgrInfo *normprocinfo = IvfflatOptionalProcInfo(index, IVFFLAT_KMEANS_NORM_PROC); + + /* Copy existing vectors while avoiding duplicates */ + if (samples->length > 0) + { + qsort(samples->items, samples->length, VECTOR_SIZE(samples->dim), CompareVectors); + for (i = 0; i < samples->length; i++) + { + vec = VectorArrayGet(samples, i); + + if (i == 0 || CompareVectors(vec, VectorArrayGet(samples, i - 1)) != 0) + { + VectorArraySet(centers, centers->length, vec); + centers->length++; + } + } + } + + /* Fill remaining with random data */ + while (centers->length < centers->maxlen) + { + vec = VectorArrayGet(centers, centers->length); + + SET_VARSIZE(vec, VECTOR_SIZE(dimensions)); + vec->dim = dimensions; + + for (j = 0; j < dimensions; j++) + vec->x[j] = RandomDouble(); + + /* Normalize if needed (only needed for random centers) */ + if (normprocinfo != NULL) + ApplyNorm(normprocinfo, collation, vec); + + centers->length++; + } +} + +/* + * Use Elkan for performance. This requires distance function to satisfy triangle inequality. 
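+ * Elkan's method keeps lower and upper bounds on each sample's distance to the centers and uses the triangle inequality to skip most distance calculations.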
+ * + * We use L2 distance for L2 (not L2 squared like index scan) + * and angular distance for inner product and cosine distance + * + * https://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf + */ +static void +ElkanKmeans(Relation index, VectorArray samples, VectorArray centers) +{ + FmgrInfo *procinfo; + FmgrInfo *normprocinfo; + Oid collation; + Vector *vec; + Vector *newCenter; + int iteration; + int64 j; + int64 k; + int dimensions = centers->dim; + int numCenters = centers->maxlen; + int numSamples = samples->length; + VectorArray newCenters; + int *centerCounts; + int *closestCenters; + float *lowerBound; + float *upperBound; + float *s; + float *halfcdist; + float *newcdist; + int changes; + double minDistance; + int closestCenter; + double distance; + bool rj; + bool rjreset; + double dxcx; + double dxc; + + /* Calculate allocation sizes */ + Size samplesSize = VECTOR_ARRAY_SIZE(samples->maxlen, samples->dim); + Size centersSize = VECTOR_ARRAY_SIZE(centers->maxlen, centers->dim); + Size newCentersSize = VECTOR_ARRAY_SIZE(numCenters, dimensions); + Size centerCountsSize = sizeof(int) * numCenters; + Size closestCentersSize = sizeof(int) * numSamples; + Size lowerBoundSize = sizeof(float) * numSamples * numCenters; + Size upperBoundSize = sizeof(float) * numSamples; + Size sSize = sizeof(float) * numCenters; + Size halfcdistSize = sizeof(float) * numCenters * numCenters; + Size newcdistSize = sizeof(float) * numCenters; + + /* Calculate total size */ + Size totalSize = samplesSize + centersSize + newCentersSize + centerCountsSize + closestCentersSize + lowerBoundSize + upperBoundSize + sSize + halfcdistSize + newcdistSize; + + /* Check memory requirements */ + /* Add one to error message to ceil */ + if (totalSize > (Size) maintenance_work_mem * 1024L) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("memory required is %zu MB, maintenance_work_mem is %d MB", + totalSize / (1024 * 1024) + 1, maintenance_work_mem / 1024))); + + /* Ensure indexing does not overflow */ + if (numCenters * numCenters > INT_MAX) + elog(ERROR, "Indexing overflow detected. 
Please report a bug."); + + /* Set support functions */ + procinfo = index_getprocinfo(index, 1, IVFFLAT_KMEANS_DISTANCE_PROC); + normprocinfo = IvfflatOptionalProcInfo(index, IVFFLAT_KMEANS_NORM_PROC); + collation = index->rd_indcollation[0]; + + /* Allocate space */ + /* Use float instead of double to save memory */ + centerCounts = palloc(centerCountsSize); + closestCenters = palloc(closestCentersSize); + lowerBound = palloc_extended(lowerBoundSize, MCXT_ALLOC_HUGE); + upperBound = palloc(upperBoundSize); + s = palloc(sSize); + halfcdist = palloc_extended(halfcdistSize, MCXT_ALLOC_HUGE); + newcdist = palloc(newcdistSize); + + newCenters = VectorArrayInit(numCenters, dimensions); + for (j = 0; j < numCenters; j++) + { + vec = VectorArrayGet(newCenters, j); + SET_VARSIZE(vec, VECTOR_SIZE(dimensions)); + vec->dim = dimensions; + } + + /* Pick initial centers */ + InitCenters(index, samples, centers, lowerBound); + + /* Assign each x to its closest initial center c(x) = argmin d(x,c) */ + for (j = 0; j < numSamples; j++) + { + minDistance = DBL_MAX; + closestCenter = 0; + + /* Find closest center */ + for (k = 0; k < numCenters; k++) + { + /* TODO Use Lemma 1 in k-means++ initialization */ + distance = lowerBound[j * numCenters + k]; + + if (distance < minDistance) + { + minDistance = distance; + closestCenter = k; + } + } + + upperBound[j] = minDistance; + closestCenters[j] = closestCenter; + } + + /* Give 500 iterations to converge */ + for (iteration = 0; iteration < 500; iteration++) + { + /* Can take a while, so ensure we can interrupt */ + CHECK_FOR_INTERRUPTS(); + + changes = 0; + + /* Step 1: For all centers, compute distance */ + for (j = 0; j < numCenters; j++) + { + vec = VectorArrayGet(centers, j); + + for (k = j + 1; k < numCenters; k++) + { + distance = 0.5 * DatumGetFloat8(FunctionCall2Coll(procinfo, collation, PointerGetDatum(vec), PointerGetDatum(VectorArrayGet(centers, k)))); + halfcdist[j * numCenters + k] = distance; + halfcdist[k * numCenters + j] = distance; + } + } + + /* For all centers c, compute s(c) */ + for (j = 0; j < numCenters; j++) + { + minDistance = DBL_MAX; + + for (k = 0; k < numCenters; k++) + { + if (j == k) + continue; + + distance = halfcdist[j * numCenters + k]; + if (distance < minDistance) + minDistance = distance; + } + + s[j] = minDistance; + } + + rjreset = iteration != 0; + + for (j = 0; j < numSamples; j++) + { + /* Step 2: Identify all points x such that u(x) <= s(c(x)) */ + if (upperBound[j] <= s[closestCenters[j]]) + continue; + + rj = rjreset; + + for (k = 0; k < numCenters; k++) + { + /* Step 3: For all remaining points x and centers c */ + if (k == closestCenters[j]) + continue; + + if (upperBound[j] <= lowerBound[j * numCenters + k]) + continue; + + if (upperBound[j] <= halfcdist[closestCenters[j] * numCenters + k]) + continue; + + vec = VectorArrayGet(samples, j); + + /* Step 3a */ + if (rj) + { + dxcx = DatumGetFloat8(FunctionCall2Coll(procinfo, collation, PointerGetDatum(vec), PointerGetDatum(VectorArrayGet(centers, closestCenters[j])))); + + /* d(x,c(x)) computed, which is a form of d(x,c) */ + lowerBound[j * numCenters + closestCenters[j]] = dxcx; + upperBound[j] = dxcx; + + rj = false; + } + else + dxcx = upperBound[j]; + + /* Step 3b */ + if (dxcx > lowerBound[j * numCenters + k] || dxcx > halfcdist[closestCenters[j] * numCenters + k]) + { + dxc = DatumGetFloat8(FunctionCall2Coll(procinfo, collation, PointerGetDatum(vec), PointerGetDatum(VectorArrayGet(centers, k)))); + + /* d(x,c) calculated */ + lowerBound[j * numCenters + k] = 
dxc; + + if (dxc < dxcx) + { + closestCenters[j] = k; + + /* c(x) changed */ + upperBound[j] = dxc; + + changes++; + } + + } + } + } + + /* Step 4: For each center c, let m(c) be mean of all points assigned */ + for (j = 0; j < numCenters; j++) + { + vec = VectorArrayGet(newCenters, j); + for (k = 0; k < dimensions; k++) + vec->x[k] = 0.0; + + centerCounts[j] = 0; + } + + for (j = 0; j < numSamples; j++) + { + vec = VectorArrayGet(samples, j); + closestCenter = closestCenters[j]; + + /* Increment sum and count of closest center */ + newCenter = VectorArrayGet(newCenters, closestCenter); + for (k = 0; k < dimensions; k++) + newCenter->x[k] += vec->x[k]; + + centerCounts[closestCenter] += 1; + } + + for (j = 0; j < numCenters; j++) + { + vec = VectorArrayGet(newCenters, j); + + if (centerCounts[j] > 0) + { + /* Double avoids overflow, but requires more memory */ + /* TODO Update bounds */ + for (k = 0; k < dimensions; k++) + { + if (isinf(vec->x[k])) + vec->x[k] = vec->x[k] > 0 ? FLT_MAX : -FLT_MAX; + } + + for (k = 0; k < dimensions; k++) + vec->x[k] /= centerCounts[j]; + } + else + { + /* TODO Handle empty centers properly */ + for (k = 0; k < dimensions; k++) + vec->x[k] = RandomDouble(); + } + + /* Normalize if needed */ + if (normprocinfo != NULL) + ApplyNorm(normprocinfo, collation, vec); + } + + /* Step 5 */ + for (j = 0; j < numCenters; j++) + newcdist[j] = DatumGetFloat8(FunctionCall2Coll(procinfo, collation, PointerGetDatum(VectorArrayGet(centers, j)), PointerGetDatum(VectorArrayGet(newCenters, j)))); + + for (j = 0; j < numSamples; j++) + { + for (k = 0; k < numCenters; k++) + { + distance = lowerBound[j * numCenters + k] - newcdist[k]; + + if (distance < 0) + distance = 0; + + lowerBound[j * numCenters + k] = distance; + } + } + + /* Step 6 */ + /* We reset r(x) before Step 3 in the next iteration */ + for (j = 0; j < numSamples; j++) + upperBound[j] += newcdist[closestCenters[j]]; + + /* Step 7 */ + for (j = 0; j < numCenters; j++) + memcpy(VectorArrayGet(centers, j), VectorArrayGet(newCenters, j), VECTOR_SIZE(dimensions)); + + if (changes == 0 && iteration != 0) + break; + } + + VectorArrayFree(newCenters); + pfree(centerCounts); + pfree(closestCenters); + pfree(lowerBound); + pfree(upperBound); + pfree(s); + pfree(halfcdist); + pfree(newcdist); +} + +/* + * Detect issues with centers + */ +static void +CheckCenters(Relation index, VectorArray centers) +{ + FmgrInfo *normprocinfo; + Oid collation; + Vector *vec; + int i; + int j; + double norm; + + if (centers->length != centers->maxlen) + elog(ERROR, "Not enough centers. Please report a bug."); + + /* Ensure no NaN or infinite values */ + for (i = 0; i < centers->length; i++) + { + vec = VectorArrayGet(centers, i); + + for (j = 0; j < vec->dim; j++) + { + if (isnan(vec->x[j])) + elog(ERROR, "NaN detected. Please report a bug."); + + if (isinf(vec->x[j])) + elog(ERROR, "Infinite value detected. Please report a bug."); + } + } + + /* Ensure no duplicate centers */ + /* Fine to sort in-place */ + qsort(centers->items, centers->length, VECTOR_SIZE(centers->dim), CompareVectors); + for (i = 1; i < centers->length; i++) + { + if (CompareVectors(VectorArrayGet(centers, i), VectorArrayGet(centers, i - 1)) == 0) + elog(ERROR, "Duplicate centers detected. 
Please report a bug."); + } + + /* Ensure no zero vectors for cosine distance */ + /* Check NORM_PROC instead of KMEANS_NORM_PROC */ + normprocinfo = IvfflatOptionalProcInfo(index, IVFFLAT_NORM_PROC); + if (normprocinfo != NULL) + { + collation = index->rd_indcollation[0]; + + for (i = 0; i < centers->length; i++) + { + norm = DatumGetFloat8(FunctionCall1Coll(normprocinfo, collation, PointerGetDatum(VectorArrayGet(centers, i)))); + if (norm == 0) + elog(ERROR, "Zero norm detected. Please report a bug."); + } + } +} + +/* + * Perform naive k-means centering + * We use spherical k-means for inner product and cosine + */ +void +IvfflatKmeans(Relation index, VectorArray samples, VectorArray centers) +{ + if (samples->length <= centers->maxlen) + QuickCenters(index, samples, centers); + else + ElkanKmeans(index, samples, centers); + + CheckCenters(index, centers); +} diff --git a/contrib/pgvector/src/ivfscan.c b/contrib/pgvector/src/ivfscan.c new file mode 100644 index 0000000000000000000000000000000000000000..fa3961b03e41c363fdf517c24246cee21a1b757f --- /dev/null +++ b/contrib/pgvector/src/ivfscan.c @@ -0,0 +1,362 @@ +#include "postgres.h" + +#include + +#include "access/relscan.h" +#include "ivfflat.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/bufmgr.h" + +#include "catalog/pg_operator_d.h" +#include "catalog/pg_type_d.h" + +/* + * Compare list distances + */ +static int +CompareLists(const pairingheap_node *a, const pairingheap_node *b, void *arg) +{ + if (((const IvfflatScanList *) a)->distance > ((const IvfflatScanList *) b)->distance) + return 1; + + if (((const IvfflatScanList *) a)->distance < ((const IvfflatScanList *) b)->distance) + return -1; + + return 0; +} + +/* + * Get lists and sort by distance + */ +static void +GetScanLists(IndexScanDesc scan, Datum value) +{ + Buffer cbuf; + Page cpage; + IvfflatList list; + OffsetNumber offno; + OffsetNumber maxoffno; + BlockNumber nextblkno = IVFFLAT_HEAD_BLKNO; + int listCount = 0; + IvfflatScanOpaque so = (IvfflatScanOpaque) scan->opaque; + double distance; + IvfflatScanList *scanlist; + double maxDistance = DBL_MAX; + + /* Search all list pages */ + while (BlockNumberIsValid(nextblkno)) + { + cbuf = ReadBuffer(scan->indexRelation, nextblkno); + LockBuffer(cbuf, BUFFER_LOCK_SHARE); + cpage = BufferGetPage(cbuf); + + maxoffno = PageGetMaxOffsetNumber(cpage); + + for (offno = FirstOffsetNumber; offno <= maxoffno; offno = OffsetNumberNext(offno)) + { + list = (IvfflatList) PageGetItem(cpage, PageGetItemId(cpage, offno)); + + /* Use procinfo from the index instead of scan key for performance */ + distance = DatumGetFloat8(FunctionCall2Coll(so->procinfo, so->collation, PointerGetDatum(&list->center), value)); + + if (listCount < so->probes) + { + scanlist = &so->lists[listCount]; + scanlist->startPage = list->startPage; + scanlist->distance = distance; + listCount++; + + /* Add to heap */ + pairingheap_add(so->listQueue, &scanlist->ph_node); + + /* Calculate max distance */ + if (listCount == so->probes) + maxDistance = ((IvfflatScanList *) pairingheap_first(so->listQueue))->distance; + } + else if (distance < maxDistance) + { + /* Remove */ + scanlist = (IvfflatScanList *) pairingheap_remove_first(so->listQueue); + + /* Reuse */ + scanlist->startPage = list->startPage; + scanlist->distance = distance; + pairingheap_add(so->listQueue, &scanlist->ph_node); + + /* Update max distance */ + maxDistance = ((IvfflatScanList *) pairingheap_first(so->listQueue))->distance; + } + } + + nextblkno = 
IvfflatPageGetOpaque(cpage)->nextblkno; + + UnlockReleaseBuffer(cbuf); + } +} + +/* + * Get items + */ +static void +GetScanItems(IndexScanDesc scan, Datum value) +{ + IvfflatScanOpaque so = (IvfflatScanOpaque) scan->opaque; + Buffer buf; + Page page; + IndexTuple itup; + BlockNumber searchPage; + OffsetNumber offno; + OffsetNumber maxoffno; + Datum datum; + bool isnull; + TupleDesc tupdesc = RelationGetDescr(scan->indexRelation); + double tuples = 0; + +#if PG_VERSION_NUM >= 120000 + TupleTableSlot *slot = MakeSingleTupleTableSlot(so->tupdesc, &TTSOpsVirtual); +#else + TupleTableSlot *slot = MakeSingleTupleTableSlot(so->tupdesc); +#endif + + /* + * Reuse same set of shared buffers for scan + * + * See postgres/src/backend/storage/buffer/README for description + */ + BufferAccessStrategy bas = GetAccessStrategy(BAS_BULKREAD); + + /* Search closest probes lists */ + while (!pairingheap_is_empty(so->listQueue)) + { + searchPage = ((IvfflatScanList *) pairingheap_remove_first(so->listQueue))->startPage; + + /* Search all entry pages for list */ + while (BlockNumberIsValid(searchPage)) + { + buf = ReadBufferExtended(scan->indexRelation, MAIN_FORKNUM, searchPage, RBM_NORMAL, bas); + LockBuffer(buf, BUFFER_LOCK_SHARE); + page = BufferGetPage(buf); + maxoffno = PageGetMaxOffsetNumber(page); + + for (offno = FirstOffsetNumber; offno <= maxoffno; offno = OffsetNumberNext(offno)) + { + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offno)); + datum = index_getattr(itup, 1, tupdesc, &isnull); + + /* + * Add virtual tuple + * + * Use procinfo from the index instead of scan key for + * performance + */ + ExecClearTuple(slot); + slot->tts_values[0] = FunctionCall2Coll(so->procinfo, so->collation, datum, value); + slot->tts_isnull[0] = false; + slot->tts_values[1] = PointerGetDatum(&itup->t_tid); + slot->tts_isnull[1] = false; + slot->tts_values[2] = Int32GetDatum((int) searchPage); + slot->tts_isnull[2] = false; + ExecStoreVirtualTuple(slot); + + tuplesort_puttupleslot(so->sortstate, slot); + + tuples++; + } + + searchPage = IvfflatPageGetOpaque(page)->nextblkno; + + UnlockReleaseBuffer(buf); + } + } + + FreeAccessStrategy(bas); + + /* TODO Scan more lists */ + if (tuples < 100) + ereport(DEBUG1, + (errmsg("index scan found few tuples"), + errdetail("Index may have been created with little data."), + errhint("Recreate the index and possibly decrease lists."))); + + tuplesort_performsort(so->sortstate); +} + +/* + * Prepare for an index scan + */ +IndexScanDesc +ivfflatbeginscan(Relation index, int nkeys, int norderbys) +{ + IndexScanDesc scan; + IvfflatScanOpaque so; + int lists; + AttrNumber attNums[] = {1}; + Oid sortOperators[] = {Float8LessOperator}; + Oid sortCollations[] = {InvalidOid}; + bool nullsFirstFlags[] = {false}; + int probes = ivfflat_probes; + + scan = RelationGetIndexScan(index, nkeys, norderbys); + lists = IvfflatGetLists(scan->indexRelation); + + if (probes > lists) + probes = lists; + + so = (IvfflatScanOpaque) palloc(offsetof(IvfflatScanOpaqueData, lists) + probes * sizeof(IvfflatScanList)); + so->buf = InvalidBuffer; + so->first = true; + so->probes = probes; + + /* Set support functions */ + so->procinfo = index_getprocinfo(index, 1, IVFFLAT_DISTANCE_PROC); + so->normprocinfo = IvfflatOptionalProcInfo(index, IVFFLAT_NORM_PROC); + so->collation = index->rd_indcollation[0]; + + /* Create tuple description for sorting */ +#if PG_VERSION_NUM >= 120000 + so->tupdesc = CreateTemplateTupleDesc(3); +#else + so->tupdesc = CreateTemplateTupleDesc(3, false); +#endif + 
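+	/* Column 1 (distance) is the sort key; the heap TID and index block number ride along for fetching results */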
TupleDescInitEntry(so->tupdesc, (AttrNumber) 1, "distance", FLOAT8OID, -1, 0); + TupleDescInitEntry(so->tupdesc, (AttrNumber) 2, "tid", TIDOID, -1, 0); + TupleDescInitEntry(so->tupdesc, (AttrNumber) 3, "indexblkno", INT4OID, -1, 0); + + /* Prep sort */ + so->sortstate = tuplesort_begin_heap(so->tupdesc, 1, attNums, sortOperators, sortCollations, nullsFirstFlags, work_mem, NULL, false); + +#if PG_VERSION_NUM >= 120000 + so->slot = MakeSingleTupleTableSlot(so->tupdesc, &TTSOpsMinimalTuple); +#else + so->slot = MakeSingleTupleTableSlot(so->tupdesc); +#endif + + so->listQueue = pairingheap_allocate(CompareLists, scan); + + scan->opaque = so; + + return scan; +} + +/* + * Start or restart an index scan + */ +void +ivfflatrescan(IndexScanDesc scan, ScanKey keys, int nkeys, ScanKey orderbys, int norderbys) +{ + IvfflatScanOpaque so = (IvfflatScanOpaque) scan->opaque; + +#if PG_VERSION_NUM >= 130000 + if (!so->first) + tuplesort_reset(so->sortstate); +#endif + + so->first = true; + pairingheap_reset(so->listQueue); + + if (keys && scan->numberOfKeys > 0) + memmove(scan->keyData, keys, scan->numberOfKeys * sizeof(ScanKeyData)); + + if (orderbys && scan->numberOfOrderBys > 0) + memmove(scan->orderByData, orderbys, scan->numberOfOrderBys * sizeof(ScanKeyData)); +} + +/* + * Fetch the next tuple in the given scan + */ +bool +ivfflatgettuple(IndexScanDesc scan, ScanDirection dir) +{ + IvfflatScanOpaque so = (IvfflatScanOpaque) scan->opaque; + + /* + * Index can be used to scan backward, but Postgres doesn't support + * backward scan on operators + */ + Assert(ScanDirectionIsForward(dir)); + + if (so->first) + { + Datum value; + + /* Count index scan for stats */ + pgstat_count_index_scan(scan->indexRelation); + + /* Safety check */ + if (scan->orderByData == NULL) + elog(ERROR, "cannot scan ivfflat index without order"); + + /* No items will match if null */ + if (scan->orderByData->sk_flags & SK_ISNULL) + return false; + + value = scan->orderByData->sk_argument; + + /* Value should not be compressed or toasted */ + Assert(!VARATT_IS_COMPRESSED(DatumGetPointer(value))); + Assert(!VARATT_IS_EXTENDED(DatumGetPointer(value))); + + if (so->normprocinfo != NULL) + { + /* No items will match if normalization fails */ + if (!IvfflatNormValue(so->normprocinfo, so->collation, &value, NULL)) + return false; + } + + IvfflatBench("GetScanLists", GetScanLists(scan, value)); + IvfflatBench("GetScanItems", GetScanItems(scan, value)); + so->first = false; + + /* Clean up if we allocated a new value */ + if (value != scan->orderByData->sk_argument) + pfree(DatumGetPointer(value)); + } + + if (tuplesort_gettupleslot(so->sortstate, true, false, so->slot, NULL)) + { + ItemPointer tid = (ItemPointer) DatumGetPointer(slot_getattr(so->slot, 2, &so->isnull)); + BlockNumber indexblkno = DatumGetInt32(slot_getattr(so->slot, 3, &so->isnull)); + +#if PG_VERSION_NUM >= 120000 + scan->xs_heaptid = *tid; +#else + scan->xs_ctup.t_self = *tid; +#endif + + if (BufferIsValid(so->buf)) + ReleaseBuffer(so->buf); + + /* + * An index scan must maintain a pin on the index page holding the + * item last returned by amgettuple + * + * https://www.postgresql.org/docs/current/index-locking.html + */ + so->buf = ReadBuffer(scan->indexRelation, indexblkno); + + scan->xs_recheckorderby = false; + return true; + } + + return false; +} + +/* + * End a scan and release resources + */ +void +ivfflatendscan(IndexScanDesc scan) +{ + IvfflatScanOpaque so = (IvfflatScanOpaque) scan->opaque; + + /* Release pin */ + if (BufferIsValid(so->buf)) + 
ReleaseBuffer(so->buf); + + pairingheap_free(so->listQueue); + tuplesort_end(so->sortstate); + + pfree(so); + scan->opaque = NULL; +} diff --git a/contrib/pgvector/src/ivfutils.c b/contrib/pgvector/src/ivfutils.c new file mode 100644 index 0000000000000000000000000000000000000000..7cf6fe65db5b3461a2aa32d16d1eca6cd42ad852 --- /dev/null +++ b/contrib/pgvector/src/ivfutils.c @@ -0,0 +1,225 @@ +#include "postgres.h" + +#include "ivfflat.h" +#include "storage/bufmgr.h" +#include "vector.h" + +/* + * Allocate a vector array + */ +VectorArray +VectorArrayInit(int maxlen, int dimensions) +{ + VectorArray res = palloc(sizeof(VectorArrayData)); + + res->length = 0; + res->maxlen = maxlen; + res->dim = dimensions; + res->items = palloc_extended(maxlen * VECTOR_SIZE(dimensions), MCXT_ALLOC_ZERO | MCXT_ALLOC_HUGE); + return res; +} + +/* + * Free a vector array + */ +void +VectorArrayFree(VectorArray arr) +{ + pfree(arr->items); + pfree(arr); +} + +/* + * Print vector array - useful for debugging + */ +void +PrintVectorArray(char *msg, VectorArray arr) +{ + int i; + + for (i = 0; i < arr->length; i++) + PrintVector(msg, VectorArrayGet(arr, i)); +} + +/* + * Get the number of lists in the index + */ +int +IvfflatGetLists(Relation index) +{ + IvfflatOptions *opts = (IvfflatOptions *) index->rd_options; + + if (opts) + return opts->lists; + + return IVFFLAT_DEFAULT_LISTS; +} + +/* + * Get proc + */ +FmgrInfo * +IvfflatOptionalProcInfo(Relation rel, uint16 procnum) +{ + if (!OidIsValid(index_getprocid(rel, 1, procnum))) + return NULL; + + return index_getprocinfo(rel, 1, procnum); +} + +/* + * Divide by the norm + * + * Returns false if value should not be indexed + * + * The caller needs to free the pointer stored in value + * if it's different than the original value + */ +bool +IvfflatNormValue(FmgrInfo *procinfo, Oid collation, Datum *value, Vector * result) +{ + Vector *v; + int i; + double norm; + + norm = DatumGetFloat8(FunctionCall1Coll(procinfo, collation, *value)); + + if (norm > 0) + { + v = DatumGetVector(*value); + + if (result == NULL) + result = InitVector(v->dim); + + for (i = 0; i < v->dim; i++) + result->x[i] = v->x[i] / norm; + + *value = PointerGetDatum(result); + + return true; + } + + return false; +} + +/* + * New buffer + */ +Buffer +IvfflatNewBuffer(Relation index, ForkNumber forkNum) +{ + Buffer buf = ReadBufferExtended(index, forkNum, P_NEW, RBM_NORMAL, NULL); + + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + return buf; +} + +/* + * Init page + */ +void +IvfflatInitPage(Buffer buf, Page page) +{ + PageInit(page, BufferGetPageSize(buf), sizeof(IvfflatPageOpaqueData)); + IvfflatPageGetOpaque(page)->nextblkno = InvalidBlockNumber; + IvfflatPageGetOpaque(page)->page_id = IVFFLAT_PAGE_ID; +} + +/* + * Init and register page + */ +void +IvfflatInitRegisterPage(Relation index, Buffer *buf, Page *page, GenericXLogState **state) +{ + *state = GenericXLogStart(index); + *page = GenericXLogRegisterBuffer(*state, *buf, GENERIC_XLOG_FULL_IMAGE); + IvfflatInitPage(*buf, *page); +} + +/* + * Commit buffer + */ +void +IvfflatCommitBuffer(Buffer buf, GenericXLogState *state) +{ + MarkBufferDirty(buf); + GenericXLogFinish(state); + UnlockReleaseBuffer(buf); +} + +/* + * Add a new page + * + * The order is very important!! 
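+ * (link the previous page to the new page and initialize the new page within the same generic WAL record, and keep the previous page locked until that record is finished)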
+ */ +void +IvfflatAppendPage(Relation index, Buffer *buf, Page *page, GenericXLogState **state, ForkNumber forkNum) +{ + /* Get new buffer */ + Buffer newbuf = IvfflatNewBuffer(index, forkNum); + Page newpage = GenericXLogRegisterBuffer(*state, newbuf, GENERIC_XLOG_FULL_IMAGE); + + /* Update the previous buffer */ + IvfflatPageGetOpaque(*page)->nextblkno = BufferGetBlockNumber(newbuf); + + /* Init new page */ + IvfflatInitPage(newbuf, newpage); + + /* Commit */ + MarkBufferDirty(*buf); + MarkBufferDirty(newbuf); + GenericXLogFinish(*state); + + /* Unlock */ + UnlockReleaseBuffer(*buf); + + *state = GenericXLogStart(index); + *page = GenericXLogRegisterBuffer(*state, newbuf, GENERIC_XLOG_FULL_IMAGE); + *buf = newbuf; +} + +/* + * Update the start or insert page of a list + */ +void +IvfflatUpdateList(Relation index, GenericXLogState *state, ListInfo listInfo, + BlockNumber insertPage, BlockNumber originalInsertPage, + BlockNumber startPage, ForkNumber forkNum) +{ + Buffer buf; + Page page; + IvfflatList list; + bool changed = false; + + buf = ReadBufferExtended(index, forkNum, listInfo.blkno, RBM_NORMAL, NULL); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + state = GenericXLogStart(index); + page = GenericXLogRegisterBuffer(state, buf, 0); + list = (IvfflatList) PageGetItem(page, PageGetItemId(page, listInfo.offno)); + + if (BlockNumberIsValid(insertPage) && insertPage != list->insertPage) + { + /* Skip update if insert page is lower than original insert page */ + /* This is needed to prevent insert from overwriting vacuum */ + if (!BlockNumberIsValid(originalInsertPage) || insertPage >= originalInsertPage) + { + list->insertPage = insertPage; + changed = true; + } + } + + if (BlockNumberIsValid(startPage) && startPage != list->startPage) + { + list->startPage = startPage; + changed = true; + } + + /* Only commit if changed */ + if (changed) + IvfflatCommitBuffer(buf, state); + else + { + GenericXLogAbort(state); + UnlockReleaseBuffer(buf); + } +} diff --git a/contrib/pgvector/src/ivfvacuum.c b/contrib/pgvector/src/ivfvacuum.c new file mode 100644 index 0000000000000000000000000000000000000000..f9725f7a44db8048141a0a878274778c1b9ee9d9 --- /dev/null +++ b/contrib/pgvector/src/ivfvacuum.c @@ -0,0 +1,159 @@ +#include "postgres.h" + +#include "commands/vacuum.h" +#include "ivfflat.h" +#include "storage/bufmgr.h" + +/* + * Bulk delete tuples from the index + */ +IndexBulkDeleteResult * +ivfflatbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, + IndexBulkDeleteCallback callback, void *callback_state) +{ + Relation index = info->index; + Buffer cbuf; + Page cpage; + Buffer buf; + Page page; + IvfflatList list; + IndexTuple itup; + ItemPointer htup; + OffsetNumber deletable[MaxOffsetNumber]; + int ndeletable; + BlockNumber startPages[MaxOffsetNumber]; + BlockNumber nextblkno = IVFFLAT_HEAD_BLKNO; + BlockNumber searchPage; + BlockNumber insertPage; + GenericXLogState *state; + OffsetNumber coffno; + OffsetNumber cmaxoffno; + OffsetNumber offno; + OffsetNumber maxoffno; + ListInfo listInfo; + BufferAccessStrategy bas = GetAccessStrategy(BAS_BULKREAD); + + if (stats == NULL) + stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + + /* Iterate over list pages */ + while (BlockNumberIsValid(nextblkno)) + { + cbuf = ReadBuffer(index, nextblkno); + LockBuffer(cbuf, BUFFER_LOCK_SHARE); + cpage = BufferGetPage(cbuf); + + cmaxoffno = PageGetMaxOffsetNumber(cpage); + + /* Iterate over lists */ + for (coffno = FirstOffsetNumber; coffno <= cmaxoffno; coffno = 
OffsetNumberNext(coffno)) + { + list = (IvfflatList) PageGetItem(cpage, PageGetItemId(cpage, coffno)); + startPages[coffno - FirstOffsetNumber] = list->startPage; + } + + listInfo.blkno = nextblkno; + nextblkno = IvfflatPageGetOpaque(cpage)->nextblkno; + + UnlockReleaseBuffer(cbuf); + + for (coffno = FirstOffsetNumber; coffno <= cmaxoffno; coffno = OffsetNumberNext(coffno)) + { + searchPage = startPages[coffno - FirstOffsetNumber]; + insertPage = InvalidBlockNumber; + + /* Iterate over entry pages */ + while (BlockNumberIsValid(searchPage)) + { + vacuum_delay_point(); + + buf = ReadBufferExtended(index, MAIN_FORKNUM, searchPage, RBM_NORMAL, bas); + + /* + * ambulkdelete cannot delete entries from pages that are + * pinned by other backends + * + * https://www.postgresql.org/docs/current/index-locking.html + */ + LockBufferForCleanup(buf); + + state = GenericXLogStart(index); + page = GenericXLogRegisterBuffer(state, buf, 0); + + maxoffno = PageGetMaxOffsetNumber(page); + ndeletable = 0; + + /* Find deleted tuples */ + for (offno = FirstOffsetNumber; offno <= maxoffno; offno = OffsetNumberNext(offno)) + { + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offno)); + htup = &(itup->t_tid); + + if (callback(htup, callback_state)) + { + deletable[ndeletable++] = offno; + stats->tuples_removed++; + } + else + stats->num_index_tuples++; + } + + /* Set to first free page */ + /* Must be set before searchPage is updated */ + if (!BlockNumberIsValid(insertPage) && ndeletable > 0) + insertPage = searchPage; + + searchPage = IvfflatPageGetOpaque(page)->nextblkno; + + if (ndeletable > 0) + { + /* Delete tuples */ + PageIndexMultiDelete(page, deletable, ndeletable); + MarkBufferDirty(buf); + GenericXLogFinish(state); + } + else + GenericXLogAbort(state); + + UnlockReleaseBuffer(buf); + } + + /* + * Update after all tuples deleted. + * + * We don't add or delete items from lists pages, so offset won't + * change. 
+ */ + if (BlockNumberIsValid(insertPage)) + { + listInfo.offno = coffno; + IvfflatUpdateList(index, state, listInfo, insertPage, InvalidBlockNumber, InvalidBlockNumber, MAIN_FORKNUM); + } + } + } + + FreeAccessStrategy(bas); + + return stats; +} + +/* + * Clean up after a VACUUM operation + */ +IndexBulkDeleteResult * +ivfflatvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) +{ + Relation rel = info->index; + + if (info->analyze_only) + return stats; + + /* stats is NULL if ambulkdelete not called */ + /* OK to return NULL if index not changed */ + if (stats == NULL) + return NULL; + + stats->num_pages = RelationGetNumberOfBlocks(rel); + + return stats; +} diff --git a/contrib/pgvector/src/vector.c b/contrib/pgvector/src/vector.c new file mode 100644 index 0000000000000000000000000000000000000000..394a478542c82f535904012050ac7c5cec3fd574 --- /dev/null +++ b/contrib/pgvector/src/vector.c @@ -0,0 +1,1021 @@ +#include "postgres.h" + +#include + +#include "vector.h" +#include "fmgr.h" +#include "catalog/pg_type.h" +#include "lib/stringinfo.h" +#include "libpq/pqformat.h" +#include "utils/array.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/numeric.h" + +#if PG_VERSION_NUM >= 120000 +#include "common/shortest_dec.h" +#include "utils/float.h" +#else +#include +#endif + +#if PG_VERSION_NUM < 130000 +#define TYPALIGN_DOUBLE 'd' +#define TYPALIGN_INT 'i' +#endif + +#define STATE_DIMS(x) (ARR_DIMS(x)[0] - 1) +#define CreateStateDatums(dim) palloc(sizeof(Datum) * (dim + 1)) + +PG_MODULE_MAGIC; + +/* + * Ensure same dimensions + */ +static inline void +CheckDims(Vector * a, Vector * b) +{ + if (a->dim != b->dim) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("different vector dimensions %d and %d", a->dim, b->dim))); +} + +/* + * Ensure expected dimensions + */ +static inline void +CheckExpectedDim(int32 typmod, int dim) +{ + if (typmod != -1 && typmod != dim) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("expected %d dimensions, not %d", typmod, dim))); +} + +/* + * Ensure valid dimensions + */ +static inline void +CheckDim(int dim) +{ + if (dim < 1) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("vector must have at least 1 dimension"))); + + if (dim > VECTOR_MAX_DIM) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("vector cannot have more than %d dimensions", VECTOR_MAX_DIM))); +} + +/* + * Ensure finite elements + */ +static inline void +CheckElement(float value) +{ + if (isnan(value)) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("NaN not allowed in vector"))); + + if (isinf(value)) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("infinite value not allowed in vector"))); +} + +/* + * Check for whitespace, since array_isspace() is static + */ +static inline bool +vector_isspace(char ch) +{ + if (ch == ' ' || + ch == '\t' || + ch == '\n' || + ch == '\r' || + ch == '\v' || + ch == '\f') + return true; + return false; +} + +/* + * Check state array + */ +static float8 * +CheckStateArray(ArrayType *statearray, const char *caller) +{ + if (ARR_NDIM(statearray) != 1 || + ARR_DIMS(statearray)[0] < 1 || + ARR_HASNULL(statearray) || + ARR_ELEMTYPE(statearray) != FLOAT8OID) + elog(ERROR, "%s: expected state array", caller); + return (float8 *) ARR_DATA_PTR(statearray); +} + +#if PG_VERSION_NUM < 120003 +static pg_noinline void +float_overflow_error(void) +{ + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("value out of range: 
overflow"))); +} +#endif + +/* + * Convert textual representation to internal representation + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_in); +Datum +vector_in(PG_FUNCTION_ARGS) +{ + char *str = PG_GETARG_CSTRING(0); + int32 typmod = PG_GETARG_INT32(2); + int i; + float x[VECTOR_MAX_DIM]; + int dim = 0; + char *pt; + char *stringEnd; + Vector *result; + char *lit = pstrdup(str); + + while (vector_isspace(*str)) + str++; + + if (*str != '[') + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed vector literal: \"%s\"", lit), + errdetail("Vector contents must start with \"[\"."))); + + str++; + pt = strtok(str, ","); + stringEnd = pt; + + while (pt != NULL && *stringEnd != ']') + { + if (dim == VECTOR_MAX_DIM) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("vector cannot have more than %d dimensions", VECTOR_MAX_DIM))); + + while (vector_isspace(*pt)) + pt++; + + /* Check for empty string like float4in */ + if (*pt == '\0') + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type vector: \"%s\"", lit))); + + /* Use strtof like float4in to avoid a double-rounding problem */ + x[dim] = strtof(pt, &stringEnd); + CheckElement(x[dim]); + dim++; + + if (stringEnd == pt) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type vector: \"%s\"", lit))); + + while (vector_isspace(*stringEnd)) + stringEnd++; + + if (*stringEnd != '\0' && *stringEnd != ']') + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type vector: \"%s\"", lit))); + + pt = strtok(NULL, ","); + } + + if (stringEnd == NULL || *stringEnd != ']') + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed vector literal: \"%s\"", lit), + errdetail("Unexpected end of input."))); + + stringEnd++; + + /* Only whitespace is allowed after the closing brace */ + while (vector_isspace(*stringEnd)) + stringEnd++; + + if (*stringEnd != '\0') + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed vector literal: \"%s\"", lit), + errdetail("Junk after closing right brace."))); + + /* Ensure no consecutive delimiters since strtok skips */ + for (pt = lit + 1; *pt != '\0'; pt++) + { + if (pt[-1] == ',' && *pt == ',') + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed vector literal: \"%s\"", lit))); + } + + if (dim < 1) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("vector must have at least 1 dimension"))); + + pfree(lit); + + CheckExpectedDim(typmod, dim); + + result = InitVector(dim); + for (i = 0; i < dim; i++) + result->x[i] = x[i]; + + PG_RETURN_POINTER(result); +} + +/* + * Convert internal representation to textual representation + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_out); +Datum +vector_out(PG_FUNCTION_ARGS) +{ + Vector *vector = PG_GETARG_VECTOR_P(0); + int dim = vector->dim; + char *buf; + char *ptr; + int i; + int n; + +#if PG_VERSION_NUM < 120000 + int ndig = FLT_DIG + extra_float_digits; + + if (ndig < 1) + ndig = 1; + +#define FLOAT_SHORTEST_DECIMAL_LEN (ndig + 10) +#endif + + /* + * Need: + * + * dim * (FLOAT_SHORTEST_DECIMAL_LEN - 1) bytes for + * float_to_shortest_decimal_bufn + * + * dim - 1 bytes for separator + * + * 3 bytes for [, ], and \0 + */ + buf = (char *) palloc(FLOAT_SHORTEST_DECIMAL_LEN * dim + 2); + ptr = buf; + + *ptr = '['; + ptr++; + for (i = 0; i < dim; i++) + { + if (i > 0) + { + *ptr = ','; + 
ptr++; + } + +#if PG_VERSION_NUM >= 120000 + n = float_to_shortest_decimal_bufn(vector->x[i], ptr); +#else + n = sprintf(ptr, "%.*g", ndig, vector->x[i]); +#endif + ptr += n; + } + *ptr = ']'; + ptr++; + *ptr = '\0'; + + PG_FREE_IF_COPY(vector, 0); + PG_RETURN_CSTRING(buf); +} + +/* + * Print vector - useful for debugging + */ +void +PrintVector(char *msg, Vector * vector) +{ + char *out = DatumGetPointer(DirectFunctionCall1(vector_out, PointerGetDatum(vector))); + + elog(INFO, "%s = %s", msg, out); + pfree(out); +} + +/* + * Convert type modifier + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_typmod_in); +Datum +vector_typmod_in(PG_FUNCTION_ARGS) +{ + ArrayType *ta = PG_GETARG_ARRAYTYPE_P(0); + int32 *tl; + int n; + + tl = ArrayGetIntegerTypmods(ta, &n); + + if (n != 1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid type modifier"))); + + if (*tl < 1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("dimensions for type vector must be at least 1"))); + + if (*tl > VECTOR_MAX_DIM) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("dimensions for type vector cannot exceed %d", VECTOR_MAX_DIM))); + + PG_RETURN_INT32(*tl); +} + +/* + * Convert external binary representation to internal representation + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_recv); +Datum +vector_recv(PG_FUNCTION_ARGS) +{ + StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); + int32 typmod = PG_GETARG_INT32(2); + Vector *result; + int16 dim; + int16 unused; + int i; + + dim = pq_getmsgint(buf, sizeof(int16)); + unused = pq_getmsgint(buf, sizeof(int16)); + + CheckDim(dim); + CheckExpectedDim(typmod, dim); + + if (unused != 0) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("expected unused to be 0, not %d", unused))); + + result = InitVector(dim); + for (i = 0; i < dim; i++) + { + result->x[i] = pq_getmsgfloat4(buf); + CheckElement(result->x[i]); + } + + PG_RETURN_POINTER(result); +} + +/* + * Convert internal representation to the external binary representation + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_send); +Datum +vector_send(PG_FUNCTION_ARGS) +{ + Vector *vec = PG_GETARG_VECTOR_P(0); + StringInfoData buf; + int i; + + pq_begintypsend(&buf); + pq_sendint(&buf, vec->dim, sizeof(int16)); + pq_sendint(&buf, vec->unused, sizeof(int16)); + for (i = 0; i < vec->dim; i++) + pq_sendfloat4(&buf, vec->x[i]); + + PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); +} + +/* + * Convert vector to vector + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(vector); +Datum +vector(PG_FUNCTION_ARGS) +{ + Vector *arg = PG_GETARG_VECTOR_P(0); + int32 typmod = PG_GETARG_INT32(1); + + CheckExpectedDim(typmod, arg->dim); + + PG_RETURN_POINTER(arg); +} + +/* + * Convert array to vector + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(array_to_vector); +Datum +array_to_vector(PG_FUNCTION_ARGS) +{ + ArrayType *array = PG_GETARG_ARRAYTYPE_P(0); + int32 typmod = PG_GETARG_INT32(1); + int i; + Vector *result; + int16 typlen; + bool typbyval; + char typalign; + Datum *elemsp; + bool *nullsp; + int nelemsp; + + if (ARR_NDIM(array) > 1) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("array must be 1-D"))); + + get_typlenbyvalalign(ARR_ELEMTYPE(array), &typlen, &typbyval, &typalign); + deconstruct_array(array, ARR_ELEMTYPE(array), typlen, typbyval, typalign, &elemsp, &nullsp, &nelemsp); + + CheckDim(nelemsp); + CheckExpectedDim(typmod, nelemsp); + + result = InitVector(nelemsp); + for (i = 0; i < nelemsp; i++) + { + if (nullsp[i]) + ereport(ERROR, + 
(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("array must not containing NULLs"))); + + /* TODO Move outside loop in 0.5.0 */ + if (ARR_ELEMTYPE(array) == INT4OID) + result->x[i] = DatumGetInt32(elemsp[i]); + else if (ARR_ELEMTYPE(array) == FLOAT8OID) + result->x[i] = DatumGetFloat8(elemsp[i]); + else if (ARR_ELEMTYPE(array) == FLOAT4OID) + result->x[i] = DatumGetFloat4(elemsp[i]); + else if (ARR_ELEMTYPE(array) == NUMERICOID) + result->x[i] = DatumGetFloat4(DirectFunctionCall1(numeric_float4, elemsp[i])); + else + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("unsupported array type"))); + + CheckElement(result->x[i]); + } + + PG_RETURN_POINTER(result); +} + +/* + * Convert vector to float4[] + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_to_float4); +Datum +vector_to_float4(PG_FUNCTION_ARGS) +{ + Vector *vec = PG_GETARG_VECTOR_P(0); + Datum *datums; + ArrayType *result; + int i; + + datums = (Datum *) palloc(sizeof(Datum) * vec->dim); + + for (i = 0; i < vec->dim; i++) + datums[i] = Float4GetDatum(vec->x[i]); + + /* Use TYPALIGN_INT for float4 */ + result = construct_array(datums, vec->dim, FLOAT4OID, sizeof(float4), true, TYPALIGN_INT); + + pfree(datums); + + PG_RETURN_POINTER(result); +} + +/* + * Get the L2 distance between vectors + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(l2_distance); +Datum +l2_distance(PG_FUNCTION_ARGS) +{ + Vector *a = PG_GETARG_VECTOR_P(0); + Vector *b = PG_GETARG_VECTOR_P(1); + float *ax = a->x; + float *bx = b->x; + double distance = 0.0; + double diff; + + CheckDims(a, b); + + /* Auto-vectorized */ + for (int i = 0; i < a->dim; i++) + { + diff = ax[i] - bx[i]; + distance += diff * diff; + } + + PG_RETURN_FLOAT8(sqrt(distance)); +} + +/* + * Get the L2 squared distance between vectors + * This saves a sqrt calculation + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_l2_squared_distance); +Datum +vector_l2_squared_distance(PG_FUNCTION_ARGS) +{ + Vector *a = PG_GETARG_VECTOR_P(0); + Vector *b = PG_GETARG_VECTOR_P(1); + float *ax = a->x; + float *bx = b->x; + double distance = 0.0; + double diff; + + CheckDims(a, b); + + /* Auto-vectorized */ + for (int i = 0; i < a->dim; i++) + { + diff = ax[i] - bx[i]; + distance += diff * diff; + } + + PG_RETURN_FLOAT8(distance); +} + +/* + * Get the inner product of two vectors + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(inner_product); +Datum +inner_product(PG_FUNCTION_ARGS) +{ + Vector *a = PG_GETARG_VECTOR_P(0); + Vector *b = PG_GETARG_VECTOR_P(1); + float *ax = a->x; + float *bx = b->x; + double distance = 0.0; + + CheckDims(a, b); + + /* Auto-vectorized */ + for (int i = 0; i < a->dim; i++) + distance += ax[i] * bx[i]; + + PG_RETURN_FLOAT8(distance); +} + +/* + * Get the negative inner product of two vectors + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_negative_inner_product); +Datum +vector_negative_inner_product(PG_FUNCTION_ARGS) +{ + Vector *a = PG_GETARG_VECTOR_P(0); + Vector *b = PG_GETARG_VECTOR_P(1); + float *ax = a->x; + float *bx = b->x; + double distance = 0.0; + + CheckDims(a, b); + + /* Auto-vectorized */ + for (int i = 0; i < a->dim; i++) + distance += ax[i] * bx[i]; + + PG_RETURN_FLOAT8(distance * -1); +} + +/* + * Get the cosine distance between two vectors + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(cosine_distance); +Datum +cosine_distance(PG_FUNCTION_ARGS) +{ + Vector *a = PG_GETARG_VECTOR_P(0); + Vector *b = PG_GETARG_VECTOR_P(1); + float *ax = a->x; + float *bx = b->x; + double distance = 0.0; + double norma = 0.0; + double normb = 0.0; + + CheckDims(a, b); + + /* Auto-vectorized */ + for (int i = 0; i < 
a->dim; i++) + { + distance += ax[i] * bx[i]; + norma += ax[i] * ax[i]; + normb += bx[i] * bx[i]; + } + + /* Use sqrt(a * b) over sqrt(a) * sqrt(b) */ + PG_RETURN_FLOAT8(1 - (distance / sqrt(norma * normb))); +} + +/* + * Get the distance for spherical k-means + * Currently uses angular distance since needs to satisfy triangle inequality + * Assumes inputs are unit vectors (skips norm) + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_spherical_distance); +Datum +vector_spherical_distance(PG_FUNCTION_ARGS) +{ + Vector *a = PG_GETARG_VECTOR_P(0); + Vector *b = PG_GETARG_VECTOR_P(1); + double distance = 0.0; + + CheckDims(a, b); + + /* Auto-vectorized */ + for (int i = 0; i < a->dim; i++) + distance += a->x[i] * b->x[i]; + + /* Prevent NaN with acos with loss of precision */ + if (distance > 1) + distance = 1; + else if (distance < -1) + distance = -1; + + PG_RETURN_FLOAT8(acos(distance) / M_PI); +} + +/* + * Get the dimensions of a vector + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_dims); +Datum +vector_dims(PG_FUNCTION_ARGS) +{ + Vector *a = PG_GETARG_VECTOR_P(0); + + PG_RETURN_INT32(a->dim); +} + +/* + * Get the L2 norm of a vector + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_norm); +Datum +vector_norm(PG_FUNCTION_ARGS) +{ + Vector *a = PG_GETARG_VECTOR_P(0); + float *ax = a->x; + double norm = 0.0; + + /* Auto-vectorized */ + for (int i = 0; i < a->dim; i++) + norm += ax[i] * ax[i]; + + PG_RETURN_FLOAT8(sqrt(norm)); +} + +/* + * Add vectors + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_add); +Datum +vector_add(PG_FUNCTION_ARGS) +{ + Vector *a = PG_GETARG_VECTOR_P(0); + Vector *b = PG_GETARG_VECTOR_P(1); + float *ax = a->x; + float *bx = b->x; + Vector *result; + float *rx; + + CheckDims(a, b); + + result = InitVector(a->dim); + rx = result->x; + + /* Auto-vectorized */ + for (int i = 0, imax = a->dim; i < imax; i++) + rx[i] = ax[i] + bx[i]; + + /* Check for overflow */ + for (int i = 0, imax = a->dim; i < imax; i++) + { + if (isinf(rx[i])) + float_overflow_error(); + } + + PG_RETURN_POINTER(result); +} + +/* + * Subtract vectors + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_sub); +Datum +vector_sub(PG_FUNCTION_ARGS) +{ + Vector *a = PG_GETARG_VECTOR_P(0); + Vector *b = PG_GETARG_VECTOR_P(1); + float *ax = a->x; + float *bx = b->x; + Vector *result; + float *rx; + + CheckDims(a, b); + + result = InitVector(a->dim); + rx = result->x; + + /* Auto-vectorized */ + for (int i = 0, imax = a->dim; i < imax; i++) + rx[i] = ax[i] - bx[i]; + + /* Check for overflow */ + for (int i = 0, imax = a->dim; i < imax; i++) + { + if (isinf(rx[i])) + float_overflow_error(); + } + + PG_RETURN_POINTER(result); +} + +/* + * Internal helper to compare vectors + */ +int +vector_cmp_internal(Vector * a, Vector * b) +{ + int i; + + CheckDims(a, b); + + for (i = 0; i < a->dim; i++) + { + if (a->x[i] < b->x[i]) + return -1; + + if (a->x[i] > b->x[i]) + return 1; + } + return 0; +} + +/* + * Less than + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_lt); +Datum +vector_lt(PG_FUNCTION_ARGS) +{ + Vector *a = (Vector *) PG_GETARG_VECTOR_P(0); + Vector *b = (Vector *) PG_GETARG_VECTOR_P(1); + + PG_RETURN_BOOL(vector_cmp_internal(a, b) < 0); +} + +/* + * Less than or equal + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_le); +Datum +vector_le(PG_FUNCTION_ARGS) +{ + Vector *a = (Vector *) PG_GETARG_VECTOR_P(0); + Vector *b = (Vector *) PG_GETARG_VECTOR_P(1); + + PG_RETURN_BOOL(vector_cmp_internal(a, b) <= 0); +} + +/* + * Equal + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_eq); +Datum +vector_eq(PG_FUNCTION_ARGS) +{ + Vector *a = (Vector 
*) PG_GETARG_VECTOR_P(0); + Vector *b = (Vector *) PG_GETARG_VECTOR_P(1); + + PG_RETURN_BOOL(vector_cmp_internal(a, b) == 0); +} + +/* + * Not equal + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_ne); +Datum +vector_ne(PG_FUNCTION_ARGS) +{ + Vector *a = (Vector *) PG_GETARG_VECTOR_P(0); + Vector *b = (Vector *) PG_GETARG_VECTOR_P(1); + + PG_RETURN_BOOL(vector_cmp_internal(a, b) != 0); +} + +/* + * Greater than or equal + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_ge); +Datum +vector_ge(PG_FUNCTION_ARGS) +{ + Vector *a = (Vector *) PG_GETARG_VECTOR_P(0); + Vector *b = (Vector *) PG_GETARG_VECTOR_P(1); + + PG_RETURN_BOOL(vector_cmp_internal(a, b) >= 0); +} + +/* + * Greater than + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_gt); +Datum +vector_gt(PG_FUNCTION_ARGS) +{ + Vector *a = (Vector *) PG_GETARG_VECTOR_P(0); + Vector *b = (Vector *) PG_GETARG_VECTOR_P(1); + + PG_RETURN_BOOL(vector_cmp_internal(a, b) > 0); +} + +/* + * Compare vectors + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_cmp); +Datum +vector_cmp(PG_FUNCTION_ARGS) +{ + Vector *a = (Vector *) PG_GETARG_VECTOR_P(0); + Vector *b = (Vector *) PG_GETARG_VECTOR_P(1); + + PG_RETURN_INT32(vector_cmp_internal(a, b)); +} + +/* + * Accumulate vectors + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_accum); +Datum +vector_accum(PG_FUNCTION_ARGS) +{ + ArrayType *statearray = PG_GETARG_ARRAYTYPE_P(0); + Vector *newval = PG_GETARG_VECTOR_P(1); + float8 *statevalues; + int16 dim; + bool newarr; + float8 n; + Datum *statedatums; + float *x = newval->x; + ArrayType *result; + + /* Check array before using */ + statevalues = CheckStateArray(statearray, "vector_accum"); + dim = STATE_DIMS(statearray); + newarr = dim == 0; + + if (newarr) + dim = newval->dim; + else + CheckExpectedDim(dim, newval->dim); + + n = statevalues[0] + 1.0; + + statedatums = CreateStateDatums(dim); + statedatums[0] = Float8GetDatum(n); + + if (newarr) + { + for (int i = 0; i < dim; i++) + statedatums[i + 1] = Float8GetDatum((double) x[i]); + } + else + { + for (int i = 0; i < dim; i++) + { + double v = statevalues[i + 1] + x[i]; + + /* Check for overflow */ + if (isinf(v)) + float_overflow_error(); + + statedatums[i + 1] = Float8GetDatum(v); + } + } + + /* Use float8 array like float4_accum */ + result = construct_array(statedatums, dim + 1, + FLOAT8OID, + sizeof(float8), FLOAT8PASSBYVAL, TYPALIGN_DOUBLE); + + pfree(statedatums); + + PG_RETURN_ARRAYTYPE_P(result); +} + +/* + * Combine vectors + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_combine); +Datum +vector_combine(PG_FUNCTION_ARGS) +{ + ArrayType *statearray1 = PG_GETARG_ARRAYTYPE_P(0); + ArrayType *statearray2 = PG_GETARG_ARRAYTYPE_P(1); + float8 *statevalues1; + float8 *statevalues2; + float8 n; + float8 n1; + float8 n2; + int16 dim; + Datum *statedatums; + ArrayType *result; + + /* Check arrays before using */ + statevalues1 = CheckStateArray(statearray1, "vector_combine"); + statevalues2 = CheckStateArray(statearray2, "vector_combine"); + + n1 = statevalues1[0]; + n2 = statevalues2[0]; + + if (n1 == 0.0) + { + n = n2; + dim = STATE_DIMS(statearray2); + statedatums = CreateStateDatums(dim); + for (int i = 1; i <= dim; i++) + statedatums[i] = Float8GetDatum(statevalues2[i]); + } + else if (n2 == 0.0) + { + n = n1; + dim = STATE_DIMS(statearray1); + statedatums = CreateStateDatums(dim); + for (int i = 1; i <= dim; i++) + statedatums[i] = Float8GetDatum(statevalues1[i]); + } + else + { + n = n1 + n2; + dim = STATE_DIMS(statearray1); + CheckExpectedDim(dim, STATE_DIMS(statearray2)); + statedatums = CreateStateDatums(dim); + for 
(int i = 1; i <= dim; i++) + { + double v = statevalues1[i] + statevalues2[i]; + + /* Check for overflow */ + if (isinf(v)) + float_overflow_error(); + + statedatums[i] = Float8GetDatum(v); + } + } + + statedatums[0] = Float8GetDatum(n); + + result = construct_array(statedatums, dim + 1, + FLOAT8OID, + sizeof(float8), FLOAT8PASSBYVAL, TYPALIGN_DOUBLE); + + pfree(statedatums); + + PG_RETURN_ARRAYTYPE_P(result); +} + +/* + * Average vectors + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_avg); +Datum +vector_avg(PG_FUNCTION_ARGS) +{ + ArrayType *statearray = PG_GETARG_ARRAYTYPE_P(0); + float8 *statevalues; + float8 n; + uint16 dim; + Vector *result; + + /* Check array before using */ + statevalues = CheckStateArray(statearray, "vector_avg"); + n = statevalues[0]; + + /* SQL defines AVG of no values to be NULL */ + if (n == 0.0) + PG_RETURN_NULL(); + + /* Create vector */ + dim = STATE_DIMS(statearray); + CheckDim(dim); + result = InitVector(dim); + for (int i = 0; i < dim; i++) + { + result->x[i] = statevalues[i + 1] / n; + CheckElement(result->x[i]); + } + + PG_RETURN_POINTER(result); +} diff --git a/contrib/pgvector/src/vector.h b/contrib/pgvector/src/vector.h new file mode 100644 index 0000000000000000000000000000000000000000..93aeb6a6be1994fe3b12a3db3372592fbffb368f --- /dev/null +++ b/contrib/pgvector/src/vector.h @@ -0,0 +1,45 @@ +#ifndef VECTOR_H +#define VECTOR_H + +#include "postgres.h" + +#if PG_VERSION_NUM >= 160000 +#include "varatt.h" +#endif + +#define VECTOR_MAX_DIM 16000 + +#define VECTOR_SIZE(_dim) (offsetof(Vector, x) + sizeof(float)*(_dim)) +#define DatumGetVector(x) ((Vector *) PG_DETOAST_DATUM(x)) +#define PG_GETARG_VECTOR_P(x) DatumGetVector(PG_GETARG_DATUM(x)) +#define PG_RETURN_VECTOR_P(x) PG_RETURN_POINTER(x) + +typedef struct Vector +{ + int32 vl_len_; /* varlena header (do not touch directly!) 
*/ + int16 dim; /* number of dimensions */ + int16 unused; + float x[FLEXIBLE_ARRAY_MEMBER]; +} Vector; + +void PrintVector(char *msg, Vector * vector); +int vector_cmp_internal(Vector * a, Vector * b); + +/* + * Allocate and initialize a new vector + */ +static inline Vector * +InitVector(int dim) +{ + Vector *result; + int size; + + size = VECTOR_SIZE(dim); + result = (Vector *) palloc0(size); + SET_VARSIZE(result, size); + result->dim = dim; + + return result; +} + +#endif diff --git a/contrib/pgvector/test/expected/btree.out b/contrib/pgvector/test/expected/btree.out new file mode 100644 index 0000000000000000000000000000000000000000..d8b6da5690e99595fb66fb2156bd09c0f2e929f5 --- /dev/null +++ b/contrib/pgvector/test/expected/btree.out @@ -0,0 +1,17 @@ +SET enable_seqscan = off; +CREATE TABLE t (val vector(3)); +INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); +CREATE INDEX ON t (val); +SELECT * FROM t WHERE val = '[1,2,3]'; + val +--------- + [1,2,3] +(1 row) + +SELECT * FROM t ORDER BY val LIMIT 1; + val +--------- + [0,0,0] +(1 row) + +DROP TABLE t; diff --git a/contrib/pgvector/test/expected/cast.out b/contrib/pgvector/test/expected/cast.out new file mode 100644 index 0000000000000000000000000000000000000000..37614d9338aadf029c710d208109afba86a68271 --- /dev/null +++ b/contrib/pgvector/test/expected/cast.out @@ -0,0 +1,57 @@ +SELECT ARRAY[1,2,3]::vector; + array +--------- + [1,2,3] +(1 row) + +SELECT ARRAY[1.0,2.0,3.0]::vector; + array +--------- + [1,2,3] +(1 row) + +SELECT ARRAY[1,2,3]::float4[]::vector; + array +--------- + [1,2,3] +(1 row) + +SELECT ARRAY[1,2,3]::float8[]::vector; + array +--------- + [1,2,3] +(1 row) + +SELECT ARRAY[1,2,3]::numeric[]::vector; + array +--------- + [1,2,3] +(1 row) + +SELECT '{NULL}'::real[]::vector; +ERROR: array must not containing NULLs +SELECT '{NaN}'::real[]::vector; +ERROR: NaN not allowed in vector +SELECT '{Infinity}'::real[]::vector; +ERROR: infinite value not allowed in vector +SELECT '{-Infinity}'::real[]::vector; +ERROR: infinite value not allowed in vector +SELECT '{}'::real[]::vector; +ERROR: vector must have at least 1 dimension +SELECT '[1,2,3]'::vector::real[]; + float4 +--------- + {1,2,3} +(1 row) + +SELECT array_agg(n)::vector FROM generate_series(1, 16001) n; +ERROR: vector cannot have more than 16000 dimensions +SELECT array_to_vector(array_agg(n), 16001, false) FROM generate_series(1, 16001) n; +ERROR: vector cannot have more than 16000 dimensions +-- ensure no error +SELECT ARRAY[1,2,3] = ARRAY[1,2,3]; + ?column? 
+---------- + t +(1 row) + diff --git a/contrib/pgvector/test/expected/copy.out b/contrib/pgvector/test/expected/copy.out new file mode 100644 index 0000000000000000000000000000000000000000..36d4620db31e231bf7754c40d77e8d1daf1c687d --- /dev/null +++ b/contrib/pgvector/test/expected/copy.out @@ -0,0 +1,16 @@ +CREATE TABLE t (val vector(3)); +INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); +CREATE TABLE t2 (val vector(3)); +\copy t TO 'results/data.bin' WITH (FORMAT binary) +\copy t2 FROM 'results/data.bin' WITH (FORMAT binary) +SELECT * FROM t2 ORDER BY val; + val +--------- + [0,0,0] + [1,1,1] + [1,2,3] + +(4 rows) + +DROP TABLE t; +DROP TABLE t2; diff --git a/contrib/pgvector/test/expected/functions.out b/contrib/pgvector/test/expected/functions.out new file mode 100644 index 0000000000000000000000000000000000000000..0272282f2a450831cd283d50f5cca727ed68db42 --- /dev/null +++ b/contrib/pgvector/test/expected/functions.out @@ -0,0 +1,110 @@ +SELECT '[1,2,3]'::vector + '[4,5,6]'; + ?column? +---------- + [5,7,9] +(1 row) + +SELECT '[3e38]'::vector + '[3e38]'; +ERROR: value out of range: overflow +SELECT '[1,2,3]'::vector - '[4,5,6]'; + ?column? +------------ + [-3,-3,-3] +(1 row) + +SELECT '[-3e38]'::vector - '[3e38]'; +ERROR: value out of range: overflow +SELECT vector_dims('[1,2,3]'); + vector_dims +------------- + 3 +(1 row) + +SELECT round(vector_norm('[1,1]')::numeric, 5); + round +--------- + 1.41421 +(1 row) + +SELECT vector_norm('[3,4]'); + vector_norm +------------- + 5 +(1 row) + +SELECT vector_norm('[0,1]'); + vector_norm +------------- + 1 +(1 row) + +SELECT l2_distance('[0,0]', '[3,4]'); + l2_distance +------------- + 5 +(1 row) + +SELECT l2_distance('[0,0]', '[0,1]'); + l2_distance +------------- + 1 +(1 row) + +SELECT l2_distance('[1,2]', '[3]'); +ERROR: different vector dimensions 2 and 1 +SELECT inner_product('[1,2]', '[3,4]'); + inner_product +--------------- + 11 +(1 row) + +SELECT inner_product('[1,2]', '[3]'); +ERROR: different vector dimensions 2 and 1 +SELECT cosine_distance('[1,2]', '[2,4]'); + cosine_distance +----------------- + 0 +(1 row) + +SELECT cosine_distance('[1,2]', '[0,0]'); + cosine_distance +----------------- + NaN +(1 row) + +SELECT cosine_distance('[1,1]', '[1,1]'); + cosine_distance +----------------- + 0 +(1 row) + +SELECT cosine_distance('[1,1]', '[-1,-1]'); + cosine_distance +----------------- + 2 +(1 row) + +SELECT cosine_distance('[1,2]', '[3]'); +ERROR: different vector dimensions 2 and 1 +SELECT avg(v) FROM unnest(ARRAY['[1,2,3]'::vector, '[3,5,7]']) v; + avg +----------- + [2,3.5,5] +(1 row) + +SELECT avg(v) FROM unnest(ARRAY['[1,2,3]'::vector, '[3,5,7]', NULL]) v; + avg +----------- + [2,3.5,5] +(1 row) + +SELECT avg(v) FROM unnest(ARRAY[]::vector[]) v; + avg +----- + +(1 row) + +SELECT avg(v) FROM unnest(ARRAY['[1,2]'::vector, '[3]']) v; +ERROR: expected 2 dimensions, not 1 +SELECT vector_avg(array_agg(n)) FROM generate_series(1, 16002) n; +ERROR: vector cannot have more than 16000 dimensions diff --git a/contrib/pgvector/test/expected/input.out b/contrib/pgvector/test/expected/input.out new file mode 100644 index 0000000000000000000000000000000000000000..19ef74d2354eb8e5e39a194ffb9f6d55c751e87e --- /dev/null +++ b/contrib/pgvector/test/expected/input.out @@ -0,0 +1,124 @@ +SELECT '[1,2,3]'::vector; + vector +--------- + [1,2,3] +(1 row) + +SELECT '[-1,-2,-3]'::vector; + vector +------------ + [-1,-2,-3] +(1 row) + +SELECT '[1.,2.,3.]'::vector; + vector +--------- + [1,2,3] +(1 row) + +SELECT ' [ 1, 2 , 3 ] 
'::vector; + vector +--------- + [1,2,3] +(1 row) + +SELECT '[1.23456]'::vector; + vector +----------- + [1.23456] +(1 row) + +SELECT '[hello,1]'::vector; +ERROR: invalid input syntax for type vector: "[hello,1]" +LINE 1: SELECT '[hello,1]'::vector; + ^ +SELECT '[NaN,1]'::vector; +ERROR: NaN not allowed in vector +LINE 1: SELECT '[NaN,1]'::vector; + ^ +SELECT '[Infinity,1]'::vector; +ERROR: infinite value not allowed in vector +LINE 1: SELECT '[Infinity,1]'::vector; + ^ +SELECT '[-Infinity,1]'::vector; +ERROR: infinite value not allowed in vector +LINE 1: SELECT '[-Infinity,1]'::vector; + ^ +SELECT '[1.5e38,-1.5e38]'::vector; + vector +-------------------- + [1.5e+38,-1.5e+38] +(1 row) + +SELECT '[1.5e+38,-1.5e+38]'::vector; + vector +-------------------- + [1.5e+38,-1.5e+38] +(1 row) + +SELECT '[1.5e-38,-1.5e-38]'::vector; + vector +-------------------- + [1.5e-38,-1.5e-38] +(1 row) + +SELECT '[4e38,1]'::vector; +ERROR: infinite value not allowed in vector +LINE 1: SELECT '[4e38,1]'::vector; + ^ +SELECT '[1,2,3'::vector; +ERROR: malformed vector literal: "[1,2,3" +LINE 1: SELECT '[1,2,3'::vector; + ^ +DETAIL: Unexpected end of input. +SELECT '[1,2,3]9'::vector; +ERROR: malformed vector literal: "[1,2,3]9" +LINE 1: SELECT '[1,2,3]9'::vector; + ^ +DETAIL: Junk after closing right brace. +SELECT '1,2,3'::vector; +ERROR: malformed vector literal: "1,2,3" +LINE 1: SELECT '1,2,3'::vector; + ^ +DETAIL: Vector contents must start with "[". +SELECT '['::vector; +ERROR: malformed vector literal: "[" +LINE 1: SELECT '['::vector; + ^ +DETAIL: Unexpected end of input. +SELECT '[,'::vector; +ERROR: malformed vector literal: "[," +LINE 1: SELECT '[,'::vector; + ^ +DETAIL: Unexpected end of input. +SELECT '[]'::vector; +ERROR: vector must have at least 1 dimension +LINE 1: SELECT '[]'::vector; + ^ +SELECT '[1,]'::vector; +ERROR: invalid input syntax for type vector: "[1,]" +LINE 1: SELECT '[1,]'::vector; + ^ +SELECT '[1a]'::vector; +ERROR: invalid input syntax for type vector: "[1a]" +LINE 1: SELECT '[1a]'::vector; + ^ +SELECT '[1,,3]'::vector; +ERROR: malformed vector literal: "[1,,3]" +LINE 1: SELECT '[1,,3]'::vector; + ^ +SELECT '[1, ,3]'::vector; +ERROR: invalid input syntax for type vector: "[1, ,3]" +LINE 1: SELECT '[1, ,3]'::vector; + ^ +SELECT '[1,2,3]'::vector(2); +ERROR: expected 2 dimensions, not 3 +SELECT unnest('{"[1,2,3]", "[4,5,6]"}'::vector[]); + unnest +--------- + [1,2,3] + [4,5,6] +(2 rows) + +SELECT '{"[1,2,3]"}'::vector(2)[]; +ERROR: expected 2 dimensions, not 3 diff --git a/contrib/pgvector/test/expected/ivfflat_cosine.out b/contrib/pgvector/test/expected/ivfflat_cosine.out new file mode 100644 index 0000000000000000000000000000000000000000..96db5e0b42d36bf5d4cb93d2a96dca189aefba50 --- /dev/null +++ b/contrib/pgvector/test/expected/ivfflat_cosine.out @@ -0,0 +1,19 @@ +SET enable_seqscan = off; +CREATE TABLE t (val vector(3)); +INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); +CREATE INDEX ON t USING ivfflat (val vector_cosine_ops) WITH (lists = 1); +INSERT INTO t (val) VALUES ('[1,2,4]'); +SELECT * FROM t ORDER BY val <=> '[3,3,3]'; + val +--------- + [1,1,1] + [1,2,3] + [1,2,4] +(3 rows) + +SELECT * FROM t ORDER BY val <=> (SELECT NULL::vector); + val +----- +(0 rows) + +DROP TABLE t; diff --git a/contrib/pgvector/test/expected/ivfflat_ip.out b/contrib/pgvector/test/expected/ivfflat_ip.out new file mode 100644 index 0000000000000000000000000000000000000000..d4fc5380952a0e9c9c96784239f85b8be7d3545a --- /dev/null +++ 
b/contrib/pgvector/test/expected/ivfflat_ip.out @@ -0,0 +1,20 @@ +SET enable_seqscan = off; +CREATE TABLE t (val vector(3)); +INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); +CREATE INDEX ON t USING ivfflat (val vector_ip_ops) WITH (lists = 1); +INSERT INTO t (val) VALUES ('[1,2,4]'); +SELECT * FROM t ORDER BY val <#> '[3,3,3]'; + val +--------- + [1,2,4] + [1,2,3] + [1,1,1] + [0,0,0] +(4 rows) + +SELECT * FROM t ORDER BY val <#> (SELECT NULL::vector); + val +----- +(0 rows) + +DROP TABLE t; diff --git a/contrib/pgvector/test/expected/ivfflat_l2.out b/contrib/pgvector/test/expected/ivfflat_l2.out new file mode 100644 index 0000000000000000000000000000000000000000..2e8c6c2573a1b83e3766a6d302e9bce97b95ab6a --- /dev/null +++ b/contrib/pgvector/test/expected/ivfflat_l2.out @@ -0,0 +1,26 @@ +SET enable_seqscan = off; +CREATE TABLE t (val vector(3)); +INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); +CREATE INDEX ON t USING ivfflat (val) WITH (lists = 1); +INSERT INTO t (val) VALUES ('[1,2,4]'); +SELECT * FROM t ORDER BY val <-> '[3,3,3]'; + val +--------- + [1,2,3] + [1,2,4] + [1,1,1] + [0,0,0] +(4 rows) + +SELECT * FROM t ORDER BY val <-> (SELECT NULL::vector); + val +----- +(0 rows) + +SELECT COUNT(*) FROM t; + count +------- + 5 +(1 row) + +DROP TABLE t; diff --git a/contrib/pgvector/test/expected/ivfflat_options.out b/contrib/pgvector/test/expected/ivfflat_options.out new file mode 100644 index 0000000000000000000000000000000000000000..405a75d3e85fe10e288e71d0b6c9843cbc7d62ec --- /dev/null +++ b/contrib/pgvector/test/expected/ivfflat_options.out @@ -0,0 +1,15 @@ +SET enable_seqscan = off; +CREATE TABLE t (val vector(3)); +CREATE INDEX ON t USING ivfflat (val) WITH (lists = 0); +ERROR: value 0 out of bounds for option "lists" +DETAIL: Valid values are between "1" and "32768". +CREATE INDEX ON t USING ivfflat (val) WITH (lists = 32769); +ERROR: value 32769 out of bounds for option "lists" +DETAIL: Valid values are between "1" and "32768". 
+SHOW ivfflat.probes; + ivfflat.probes +---------------- + 1 +(1 row) + +DROP TABLE t; diff --git a/contrib/pgvector/test/expected/ivfflat_unlogged.out b/contrib/pgvector/test/expected/ivfflat_unlogged.out new file mode 100644 index 0000000000000000000000000000000000000000..198ea97a2251f9d4e7ebdf0651e033ea7ae78490 --- /dev/null +++ b/contrib/pgvector/test/expected/ivfflat_unlogged.out @@ -0,0 +1,13 @@ +SET enable_seqscan = off; +CREATE UNLOGGED TABLE t (val vector(3)); +INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); +CREATE INDEX ON t USING ivfflat (val) WITH (lists = 1); +SELECT * FROM t ORDER BY val <-> '[3,3,3]'; + val +--------- + [1,2,3] + [1,1,1] + [0,0,0] +(3 rows) + +DROP TABLE t; diff --git a/contrib/pgvector/test/perl/PostgresNode.pm b/contrib/pgvector/test/perl/PostgresNode.pm new file mode 100644 index 0000000000000000000000000000000000000000..32ffb95fa99d06607a31566edc4ec78c46272b78 --- /dev/null +++ b/contrib/pgvector/test/perl/PostgresNode.pm @@ -0,0 +1,8 @@ +use PostgreSQL::Test::Cluster; + +sub get_new_node +{ + return PostgreSQL::Test::Cluster->new(@_); +} + +1; diff --git a/contrib/pgvector/test/perl/TestLib.pm b/contrib/pgvector/test/perl/TestLib.pm new file mode 100644 index 0000000000000000000000000000000000000000..1cb2a826e3c015369b5ca9e6404e6f176ed52bd5 --- /dev/null +++ b/contrib/pgvector/test/perl/TestLib.pm @@ -0,0 +1,3 @@ +use PostgreSQL::Test::Utils; + +1; diff --git a/contrib/pgvector/test/sql/btree.sql b/contrib/pgvector/test/sql/btree.sql new file mode 100644 index 0000000000000000000000000000000000000000..232b28818116b025e3548d26dcc4db20d84a96c5 --- /dev/null +++ b/contrib/pgvector/test/sql/btree.sql @@ -0,0 +1,10 @@ +SET enable_seqscan = off; + +CREATE TABLE t (val vector(3)); +INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); +CREATE INDEX ON t (val); + +SELECT * FROM t WHERE val = '[1,2,3]'; +SELECT * FROM t ORDER BY val LIMIT 1; + +DROP TABLE t; diff --git a/contrib/pgvector/test/sql/cast.sql b/contrib/pgvector/test/sql/cast.sql new file mode 100644 index 0000000000000000000000000000000000000000..cb5c88094d3de2fda5d528d8cbdcb0cb3e47114e --- /dev/null +++ b/contrib/pgvector/test/sql/cast.sql @@ -0,0 +1,16 @@ +SELECT ARRAY[1,2,3]::vector; +SELECT ARRAY[1.0,2.0,3.0]::vector; +SELECT ARRAY[1,2,3]::float4[]::vector; +SELECT ARRAY[1,2,3]::float8[]::vector; +SELECT ARRAY[1,2,3]::numeric[]::vector; +SELECT '{NULL}'::real[]::vector; +SELECT '{NaN}'::real[]::vector; +SELECT '{Infinity}'::real[]::vector; +SELECT '{-Infinity}'::real[]::vector; +SELECT '{}'::real[]::vector; +SELECT '[1,2,3]'::vector::real[]; +SELECT array_agg(n)::vector FROM generate_series(1, 16001) n; +SELECT array_to_vector(array_agg(n), 16001, false) FROM generate_series(1, 16001) n; + +-- ensure no error +SELECT ARRAY[1,2,3] = ARRAY[1,2,3]; diff --git a/contrib/pgvector/test/sql/copy.sql b/contrib/pgvector/test/sql/copy.sql new file mode 100644 index 0000000000000000000000000000000000000000..28200901c572a6d40913b87ac3c2b12417e2bdc6 --- /dev/null +++ b/contrib/pgvector/test/sql/copy.sql @@ -0,0 +1,12 @@ +CREATE TABLE t (val vector(3)); +INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); + +CREATE TABLE t2 (val vector(3)); + +\copy t TO 'results/data.bin' WITH (FORMAT binary) +\copy t2 FROM 'results/data.bin' WITH (FORMAT binary) + +SELECT * FROM t2 ORDER BY val; + +DROP TABLE t; +DROP TABLE t2; diff --git a/contrib/pgvector/test/sql/functions.sql b/contrib/pgvector/test/sql/functions.sql new file mode 100644 index 
0000000000000000000000000000000000000000..e4d33172f3169f31062beb424ef22c52cb97bcd0 --- /dev/null +++ b/contrib/pgvector/test/sql/functions.sql @@ -0,0 +1,29 @@ +SELECT '[1,2,3]'::vector + '[4,5,6]'; +SELECT '[3e38]'::vector + '[3e38]'; +SELECT '[1,2,3]'::vector - '[4,5,6]'; +SELECT '[-3e38]'::vector - '[3e38]'; + +SELECT vector_dims('[1,2,3]'); + +SELECT round(vector_norm('[1,1]')::numeric, 5); +SELECT vector_norm('[3,4]'); +SELECT vector_norm('[0,1]'); + +SELECT l2_distance('[0,0]', '[3,4]'); +SELECT l2_distance('[0,0]', '[0,1]'); +SELECT l2_distance('[1,2]', '[3]'); + +SELECT inner_product('[1,2]', '[3,4]'); +SELECT inner_product('[1,2]', '[3]'); + +SELECT cosine_distance('[1,2]', '[2,4]'); +SELECT cosine_distance('[1,2]', '[0,0]'); +SELECT cosine_distance('[1,1]', '[1,1]'); +SELECT cosine_distance('[1,1]', '[-1,-1]'); +SELECT cosine_distance('[1,2]', '[3]'); + +SELECT avg(v) FROM unnest(ARRAY['[1,2,3]'::vector, '[3,5,7]']) v; +SELECT avg(v) FROM unnest(ARRAY['[1,2,3]'::vector, '[3,5,7]', NULL]) v; +SELECT avg(v) FROM unnest(ARRAY[]::vector[]) v; +SELECT avg(v) FROM unnest(ARRAY['[1,2]'::vector, '[3]']) v; +SELECT vector_avg(array_agg(n)) FROM generate_series(1, 16002) n; diff --git a/contrib/pgvector/test/sql/input.sql b/contrib/pgvector/test/sql/input.sql new file mode 100644 index 0000000000000000000000000000000000000000..a4ad08d8bb66ad71b0bb8d8e7cff6ab7e12397a0 --- /dev/null +++ b/contrib/pgvector/test/sql/input.sql @@ -0,0 +1,27 @@ +SELECT '[1,2,3]'::vector; +SELECT '[-1,-2,-3]'::vector; +SELECT '[1.,2.,3.]'::vector; +SELECT ' [ 1, 2 , 3 ] '::vector; +SELECT '[1.23456]'::vector; +SELECT '[hello,1]'::vector; +SELECT '[NaN,1]'::vector; +SELECT '[Infinity,1]'::vector; +SELECT '[-Infinity,1]'::vector; +SELECT '[1.5e38,-1.5e38]'::vector; +SELECT '[1.5e+38,-1.5e+38]'::vector; +SELECT '[1.5e-38,-1.5e-38]'::vector; +SELECT '[4e38,1]'::vector; +SELECT '[1,2,3'::vector; +SELECT '[1,2,3]9'::vector; +SELECT '1,2,3'::vector; +SELECT '['::vector; +SELECT '[,'::vector; +SELECT '[]'::vector; +SELECT '[1,]'::vector; +SELECT '[1a]'::vector; +SELECT '[1,,3]'::vector; +SELECT '[1, ,3]'::vector; +SELECT '[1,2,3]'::vector(2); + +SELECT unnest('{"[1,2,3]", "[4,5,6]"}'::vector[]); +SELECT '{"[1,2,3]"}'::vector(2)[]; diff --git a/contrib/pgvector/test/sql/ivfflat_cosine.sql b/contrib/pgvector/test/sql/ivfflat_cosine.sql new file mode 100644 index 0000000000000000000000000000000000000000..1fec6cfd9ee9d43193f43b88969825ea43f8821c --- /dev/null +++ b/contrib/pgvector/test/sql/ivfflat_cosine.sql @@ -0,0 +1,12 @@ +SET enable_seqscan = off; + +CREATE TABLE t (val vector(3)); +INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); +CREATE INDEX ON t USING ivfflat (val vector_cosine_ops) WITH (lists = 1); + +INSERT INTO t (val) VALUES ('[1,2,4]'); + +SELECT * FROM t ORDER BY val <=> '[3,3,3]'; +SELECT * FROM t ORDER BY val <=> (SELECT NULL::vector); + +DROP TABLE t; diff --git a/contrib/pgvector/test/sql/ivfflat_ip.sql b/contrib/pgvector/test/sql/ivfflat_ip.sql new file mode 100644 index 0000000000000000000000000000000000000000..46daa4e5caef0138be21dfc921533ee53cabde9c --- /dev/null +++ b/contrib/pgvector/test/sql/ivfflat_ip.sql @@ -0,0 +1,12 @@ +SET enable_seqscan = off; + +CREATE TABLE t (val vector(3)); +INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); +CREATE INDEX ON t USING ivfflat (val vector_ip_ops) WITH (lists = 1); + +INSERT INTO t (val) VALUES ('[1,2,4]'); + +SELECT * FROM t ORDER BY val <#> '[3,3,3]'; +SELECT * FROM t ORDER BY val <#> (SELECT NULL::vector); + 
+DROP TABLE t; diff --git a/contrib/pgvector/test/sql/ivfflat_l2.sql b/contrib/pgvector/test/sql/ivfflat_l2.sql new file mode 100644 index 0000000000000000000000000000000000000000..9349572ab84fe887c865bc08b18c59a18e8976bc --- /dev/null +++ b/contrib/pgvector/test/sql/ivfflat_l2.sql @@ -0,0 +1,13 @@ +SET enable_seqscan = off; + +CREATE TABLE t (val vector(3)); +INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); +CREATE INDEX ON t USING ivfflat (val) WITH (lists = 1); + +INSERT INTO t (val) VALUES ('[1,2,4]'); + +SELECT * FROM t ORDER BY val <-> '[3,3,3]'; +SELECT * FROM t ORDER BY val <-> (SELECT NULL::vector); +SELECT COUNT(*) FROM t; + +DROP TABLE t; diff --git a/contrib/pgvector/test/sql/ivfflat_options.sql b/contrib/pgvector/test/sql/ivfflat_options.sql new file mode 100644 index 0000000000000000000000000000000000000000..d8dc45c611cdee91daafde70d1c0641ea3d80a63 --- /dev/null +++ b/contrib/pgvector/test/sql/ivfflat_options.sql @@ -0,0 +1,9 @@ +SET enable_seqscan = off; + +CREATE TABLE t (val vector(3)); +CREATE INDEX ON t USING ivfflat (val) WITH (lists = 0); +CREATE INDEX ON t USING ivfflat (val) WITH (lists = 32769); + +SHOW ivfflat.probes; + +DROP TABLE t; diff --git a/contrib/pgvector/test/sql/ivfflat_unlogged.sql b/contrib/pgvector/test/sql/ivfflat_unlogged.sql new file mode 100644 index 0000000000000000000000000000000000000000..ca4c6ba9ca676516b7bf4855735aa61f2b642351 --- /dev/null +++ b/contrib/pgvector/test/sql/ivfflat_unlogged.sql @@ -0,0 +1,9 @@ +SET enable_seqscan = off; + +CREATE UNLOGGED TABLE t (val vector(3)); +INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); +CREATE INDEX ON t USING ivfflat (val) WITH (lists = 1); + +SELECT * FROM t ORDER BY val <-> '[3,3,3]'; + +DROP TABLE t; diff --git a/contrib/pgvector/test/t/001_wal.pl b/contrib/pgvector/test/t/001_wal.pl new file mode 100644 index 0000000000000000000000000000000000000000..46060ede65230e279ba2cdc7df1ebf043fadda80 --- /dev/null +++ b/contrib/pgvector/test/t/001_wal.pl @@ -0,0 +1,97 @@ +# Based on postgres/contrib/bloom/t/001_wal.pl + +# Test generic xlog record work for ivfflat index replication. +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More tests => 31; + +my $dim = 32; + +my $node_primary; +my $node_replica; + +# Run few queries on both primary and replica and check their results match. +sub test_index_replay +{ + my ($test_name) = @_; + + # Wait for replica to catch up + my $applname = $node_replica->name; + + my $server_version_num = $node_primary->safe_psql("postgres", "SHOW server_version_num"); + my $caughtup_query = "SELECT pg_current_wal_lsn() <= replay_lsn FROM pg_stat_replication WHERE application_name = '$applname';"; + $node_primary->poll_query_until('postgres', $caughtup_query) + or die "Timed out while waiting for replica 1 to catch up"; + + my @r = (); + for (1 .. $dim) { + push(@r, rand()); + } + my $sql = join(",", @r); + + my $queries = qq( + SET enable_seqscan = off; + SELECT * FROM tst ORDER BY v <-> '[$sql]' LIMIT 10; + ); + + # Run test queries and compare their result + my $primary_result = $node_primary->safe_psql("postgres", $queries); + my $replica_result = $node_replica->safe_psql("postgres", $queries); + + is($primary_result, $replica_result, "$test_name: query result matches"); + return; +} + +# Use ARRAY[random(), random(), random(), ...] 
over +# SELECT array_agg(random()) FROM generate_series(1, $dim) +# to generate different values for each row +my $array_sql = join(",", ('random()') x $dim); + +# Initialize primary node +$node_primary = get_new_node('primary'); +$node_primary->init(allows_streaming => 1); +if ($dim > 32) { + # TODO use wal_keep_segments for Postgres < 13 + $node_primary->append_conf('postgresql.conf', qq(wal_keep_size = 1GB)); +} +if ($dim > 1500) { + $node_primary->append_conf('postgresql.conf', qq(maintenance_work_mem = 128MB)); +} +$node_primary->start; +my $backup_name = 'my_backup'; + +# Take backup +$node_primary->backup($backup_name); + +# Create streaming replica linking to primary +$node_replica = get_new_node('replica'); +$node_replica->init_from_backup($node_primary, $backup_name, + has_streaming => 1); +$node_replica->start; + +# Create ivfflat index on primary +$node_primary->safe_psql("postgres", "CREATE EXTENSION vector;"); +$node_primary->safe_psql("postgres", "CREATE TABLE tst (i int4, v vector($dim));"); +$node_primary->safe_psql("postgres", + "INSERT INTO tst SELECT i % 10, ARRAY[$array_sql] FROM generate_series(1, 100000) i;" +); +$node_primary->safe_psql("postgres", "CREATE INDEX ON tst USING ivfflat (v);"); + +# Test that queries give same result +test_index_replay('initial'); + +# Run 10 cycles of table modification. Run test queries after each modification. +for my $i (1 .. 10) +{ + $node_primary->safe_psql("postgres", "DELETE FROM tst WHERE i = $i;"); + test_index_replay("delete $i"); + $node_primary->safe_psql("postgres", "VACUUM tst;"); + test_index_replay("vacuum $i"); + my ($start, $end) = (100001 + ($i - 1) * 10000, 100000 + $i * 10000); + $node_primary->safe_psql("postgres", + "INSERT INTO tst SELECT i % 10, ARRAY[$array_sql] FROM generate_series($start, $end) i;" + ); + test_index_replay("insert $i"); +} diff --git a/contrib/pgvector/test/t/002_vacuum.pl b/contrib/pgvector/test/t/002_vacuum.pl new file mode 100644 index 0000000000000000000000000000000000000000..1c3d718469bf7fb78d8ce2733e4c5c0bdf73a87f --- /dev/null +++ b/contrib/pgvector/test/t/002_vacuum.pl @@ -0,0 +1,41 @@ +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More tests => 1; + +my $dim = 3; + +my @r = (); +for (1 .. 
$dim) { + my $v = int(rand(1000)) + 1; + push(@r, "i % $v"); +} +my $array_sql = join(", ", @r); + +# Initialize node +my $node = get_new_node('node'); +$node->init; +$node->start; + +# Create table and index +$node->safe_psql("postgres", "CREATE EXTENSION vector;"); +$node->safe_psql("postgres", "CREATE TABLE tst (i int4, v vector($dim));"); +$node->safe_psql("postgres", + "INSERT INTO tst SELECT i % 10, ARRAY[$array_sql] FROM generate_series(1, 100000) i;" +); +$node->safe_psql("postgres", "CREATE INDEX ON tst USING ivfflat (v);"); + +# Get size +my $size = $node->safe_psql("postgres", "SELECT pg_total_relation_size('tst_v_idx');"); + +# Delete all, vacuum, and insert same data +$node->safe_psql("postgres", "DELETE FROM tst;"); +$node->safe_psql("postgres", "VACUUM tst;"); +$node->safe_psql("postgres", + "INSERT INTO tst SELECT i % 10, ARRAY[$array_sql] FROM generate_series(1, 100000) i;" +); + +# Check size +my $new_size = $node->safe_psql("postgres", "SELECT pg_total_relation_size('tst_v_idx');"); +is($size, $new_size, "size does not change"); diff --git a/contrib/pgvector/test/t/003_recall.pl b/contrib/pgvector/test/t/003_recall.pl new file mode 100644 index 0000000000000000000000000000000000000000..dddc4d5e07648bc4491ef6f4d2c96cab7600ff07 --- /dev/null +++ b/contrib/pgvector/test/t/003_recall.pl @@ -0,0 +1,88 @@ +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More tests => 9; + +my $node; +my @queries = (); +my @expected; +my $limit = 20; + +sub test_recall +{ + my ($probes, $min, $operator) = @_; + my $correct = 0; + my $total = 0; + + for my $i (0 .. $#queries) { + my $actual = $node->safe_psql("postgres", qq( + SET enable_seqscan = off; + SET ivfflat.probes = $probes; + SELECT i FROM tst ORDER BY v $operator '$queries[$i]' LIMIT $limit; + )); + my @actual_ids = split("\n", $actual); + my %actual_set = map { $_ => 1 } @actual_ids; + + my @expected_ids = split("\n", $expected[$i]); + + foreach (@expected_ids) { + if (exists($actual_set{$_})) { + $correct++; + } + $total++; + } + } + + cmp_ok($correct / $total, ">=", $min, $operator); +} + +# Initialize node +$node = get_new_node('node'); +$node->init; +$node->start; + +# Create table +$node->safe_psql("postgres", "CREATE EXTENSION vector;"); +$node->safe_psql("postgres", "CREATE TABLE tst (i int4, v vector(3));"); +$node->safe_psql("postgres", + "INSERT INTO tst SELECT i, ARRAY[random(), random(), random()] FROM generate_series(1, 100000) i;" +); + +# Generate queries +for (1..20) { + my $r1 = rand(); + my $r2 = rand(); + my $r3 = rand(); + push(@queries, "[$r1,$r2,$r3]"); +} + +# Check each index type +my @operators = ("<->", "<#>", "<=>"); + +foreach (@operators) { + my $operator = $_; + + # Get exact results + @expected = (); + foreach (@queries) { + my $res = $node->safe_psql("postgres", "SELECT i FROM tst ORDER BY v $operator '$_' LIMIT $limit;"); + push(@expected, $res); + } + + # Add index + my $opclass; + if ($operator == "<->") { + $opclass = "vector_l2_ops"; + } elsif ($operator == "<#>") { + $opclass = "vector_ip_ops"; + } else { + $opclass = "vector_cosine_ops"; + } + $node->safe_psql("postgres", "CREATE INDEX ON tst USING ivfflat (v $opclass);"); + + # Test approximate results + test_recall(1, 0.75, $operator); + test_recall(10, 0.95, $operator); + test_recall(100, 1.0, $operator); +} diff --git a/contrib/pgvector/test/t/004_centers.pl b/contrib/pgvector/test/t/004_centers.pl new file mode 100644 index 0000000000000000000000000000000000000000..9c2b53a5a716b51fbed193a63bda66b981feb9bc --- 
/dev/null +++ b/contrib/pgvector/test/t/004_centers.pl @@ -0,0 +1,36 @@ +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More tests => 3; + +# Initialize node +my $node = get_new_node('node'); +$node->init; +$node->start; + +# Create table +$node->safe_psql("postgres", "CREATE EXTENSION vector;"); +$node->safe_psql("postgres", "CREATE TABLE tst (i int4, v vector(3));"); +$node->safe_psql("postgres", + "INSERT INTO tst SELECT i, '[1,2,3]' FROM generate_series(1, 10) i;" +); + +sub test_centers +{ + my ($lists) = @_; + + my ($ret, $stdout, $stderr) = $node->psql("postgres", "CREATE INDEX ON tst USING ivfflat (v) WITH (lists = $lists);"); + is($ret, 0, $stderr); +} + +# Test no error for duplicate centers +test_centers(5); +test_centers(10); + +$node->safe_psql("postgres", + "INSERT INTO tst SELECT i, '[4,5,6]' FROM generate_series(1, 10) i;" +); + +# Test no error for duplicate centers +test_centers(10); diff --git a/contrib/pgvector/test/t/005_query_recall.pl b/contrib/pgvector/test/t/005_query_recall.pl new file mode 100644 index 0000000000000000000000000000000000000000..0e58135a383a25b7e781fe373c8e780665c72d09 --- /dev/null +++ b/contrib/pgvector/test/t/005_query_recall.pl @@ -0,0 +1,45 @@ +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More tests => 60; + +# Initialize node +my $node = get_new_node('node'); +$node->init; +$node->start; + +# Create table +$node->safe_psql("postgres", "CREATE EXTENSION vector;"); +$node->safe_psql("postgres", "CREATE TABLE tst (i int4 primary key, v vector(3));"); +$node->safe_psql("postgres", + "INSERT INTO tst SELECT i, ARRAY[random(), random(), random()] FROM generate_series(1, 100000) i;" +); + +# Check each index type +my @operators = ("<->", "<#>", "<=>"); +foreach (@operators) { + my $operator = $_; + + # Add index (string comparison, so use eq rather than ==) + my $opclass; + if ($operator eq "<->") { + $opclass = "vector_l2_ops"; + } elsif ($operator eq "<#>") { + $opclass = "vector_ip_ops"; + } else { + $opclass = "vector_cosine_ops"; + } + $node->safe_psql("postgres", "CREATE INDEX ON tst USING ivfflat (v $opclass);"); + + # Test 100% recall + for (1..20) { + my $i = int(rand() * 100000) + 1; + my $query = $node->safe_psql("postgres", "SELECT v FROM tst WHERE i = $i;"); + my $res = $node->safe_psql("postgres", qq( + SET enable_seqscan = off; + SELECT v FROM tst ORDER BY v <-> '$query' LIMIT 1; + )); + is($res, $query); + } +} diff --git a/contrib/pgvector/test/t/006_lists.pl b/contrib/pgvector/test/t/006_lists.pl new file mode 100644 index 0000000000000000000000000000000000000000..eeb11aa3ca7157003ebdaf0d5a885d098e22748d --- /dev/null +++ b/contrib/pgvector/test/t/006_lists.pl @@ -0,0 +1,31 @@ +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More tests => 3; + +# Initialize node +my $node = get_new_node('node'); +$node->init; +$node->start; + +# Create table +$node->safe_psql("postgres", "CREATE EXTENSION vector;"); +$node->safe_psql("postgres", "CREATE TABLE tst (v vector(3));"); +$node->safe_psql("postgres", + "INSERT INTO tst SELECT ARRAY[random(), random(), random()] FROM generate_series(1, 100000) i;" +); + +$node->safe_psql("postgres", "CREATE INDEX lists50 ON tst USING ivfflat (v) WITH (lists = 50);"); +$node->safe_psql("postgres", "CREATE INDEX lists100 ON tst USING ivfflat (v) WITH (lists = 100);"); + +# Test prefers more lists +my $res = $node->safe_psql("postgres", "EXPLAIN SELECT v FROM tst ORDER BY v <-> '[0.5,0.5,0.5]' LIMIT 10;"); +like($res, qr/lists100/); +unlike($res, qr/lists50/); + +# Test errors 
with too much memory +my ($ret, $stdout, $stderr) = $node->psql("postgres", + "CREATE INDEX lists10000 ON tst USING ivfflat (v) WITH (lists = 10000);" +); +like($stderr, qr/memory required is/); diff --git a/contrib/pgvector/test/t/007_inserts.pl b/contrib/pgvector/test/t/007_inserts.pl new file mode 100644 index 0000000000000000000000000000000000000000..73d77b58e9b6bc0cce039619c0f738e6e186ab27 --- /dev/null +++ b/contrib/pgvector/test/t/007_inserts.pl @@ -0,0 +1,55 @@ +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More tests => 7; + +my $dim = 768; + +my $array_sql = join(",", ('random()') x $dim); + +# Initialize node +my $node = get_new_node('node'); +$node->init; +$node->start; + +# Create table and index +$node->safe_psql("postgres", "CREATE EXTENSION vector;"); +$node->safe_psql("postgres", "CREATE TABLE tst (v vector($dim));"); +$node->safe_psql("postgres", + "INSERT INTO tst SELECT ARRAY[$array_sql] FROM generate_series(1, 10000) i;" +); +$node->safe_psql("postgres", "CREATE INDEX ON tst USING ivfflat (v);"); + +$node->pgbench( + "--no-vacuum --client=5 --transactions=100", + 0, + [qr{actually processed}], + [qr{^$}], + "concurrent INSERTs", + { + "007_inserts" => "INSERT INTO tst SELECT ARRAY[$array_sql] FROM generate_series(1, 10) i;" + } +); + +sub idx_scan +{ + # Stats do not update instantaneously + # https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-STATS-VIEWS + sleep(1); + $node->safe_psql("postgres", "SELECT idx_scan FROM pg_stat_user_indexes WHERE indexrelid = 'tst_v_idx'::regclass;"); +} + +my $expected = 10000 + 5 * 100 * 10; + +my $count = $node->safe_psql("postgres", "SELECT COUNT(*) FROM tst;"); +is($count, $expected); +is(idx_scan(), 0); + +$count = $node->safe_psql("postgres", qq( + SET enable_seqscan = off; + SET ivfflat.probes = 100; + SELECT COUNT(*) FROM (SELECT v FROM tst ORDER BY v <-> (SELECT v FROM tst LIMIT 1)) t; +)); +is($count, $expected); +is(idx_scan(), 1); diff --git a/contrib/pgvector/test/t/008_avg.pl b/contrib/pgvector/test/t/008_avg.pl new file mode 100644 index 0000000000000000000000000000000000000000..f03df617df286faccec98e5455b05b3360d26ba1 --- /dev/null +++ b/contrib/pgvector/test/t/008_avg.pl @@ -0,0 +1,35 @@ +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More tests => 5; + +# Initialize node +my $node = get_new_node('node'); +$node->init; +$node->start; + +# Create table +$node->safe_psql("postgres", "CREATE EXTENSION vector;"); +$node->safe_psql("postgres", "CREATE TABLE tst (r1 real, r2 real, r3 real, v vector(3));"); +$node->safe_psql("postgres", qq( + INSERT INTO tst SELECT r1, r2, r3, ARRAY[r1, r2, r3] FROM ( + SELECT random() + 1.01 AS r1, random() + 2.01 AS r2, random() + 3.01 AS r3 FROM generate_series(1, 1000000) t + ) i; +)); + +# Test avg +my $avg = $node->safe_psql("postgres", "SELECT AVG(v) FROM tst;"); +like($avg, qr/\[1\.5/); +like($avg, qr/,2\.5/); +like($avg, qr/,3\.5/); + +# Test matches real +my $r1 = $node->safe_psql("postgres", "SELECT AVG(r1)::float4 FROM tst;"); +my $r2 = $node->safe_psql("postgres", "SELECT AVG(r2)::float4 FROM tst;"); +my $r3 = $node->safe_psql("postgres", "SELECT AVG(r3)::float4 FROM tst;"); +is($avg, "[$r1,$r2,$r3]"); + +# Test explain +my $explain = $node->safe_psql("postgres", "EXPLAIN SELECT AVG(v) FROM tst;"); +like($explain, qr/Partial Aggregate/); diff --git a/contrib/pgvector/test/t/009_storage.pl b/contrib/pgvector/test/t/009_storage.pl new file mode 100644 index 
0000000000000000000000000000000000000000..de818c78b46dc27346d3711722e501b20f5d96d7 --- /dev/null +++ b/contrib/pgvector/test/t/009_storage.pl @@ -0,0 +1,32 @@ +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More tests => 1; + +my $dim = 1024; + +# Initialize node +my $node = get_new_node('node'); +$node->init; +$node->start; + +# Create table +$node->safe_psql("postgres", "CREATE EXTENSION vector;"); +$node->safe_psql("postgres", "CREATE TABLE tst (v1 vector(1024), v2 vector(1024), v3 vector(1024));"); + +# Test insert succeeds +$node->safe_psql("postgres", + "INSERT INTO tst SELECT array_agg(n), array_agg(n), array_agg(n) FROM generate_series(1, $dim) n" +); + +# Change storage to PLAIN +$node->safe_psql("postgres", "ALTER TABLE tst ALTER COLUMN v1 SET STORAGE PLAIN"); +$node->safe_psql("postgres", "ALTER TABLE tst ALTER COLUMN v2 SET STORAGE PLAIN"); +$node->safe_psql("postgres", "ALTER TABLE tst ALTER COLUMN v3 SET STORAGE PLAIN"); + +# Test insert fails +my ($ret, $stdout, $stderr) = $node->psql("postgres", + "INSERT INTO tst SELECT array_agg(n), array_agg(n), array_agg(n) FROM generate_series(1, $dim) n" +); +like($stderr, qr/row is too big/); diff --git a/contrib/pgvector/vector.control b/contrib/pgvector/vector.control new file mode 100644 index 0000000000000000000000000000000000000000..fe1f94e0091ce773f62a2122721b8cb4aee533e3 --- /dev/null +++ b/contrib/pgvector/vector.control @@ -0,0 +1,4 @@ +comment = 'vector data type and ivfflat access method' +default_version = '0.4.4' +module_pathname = '$libdir/vector' +relocatable = true
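Note: taken together, the TAP tests above all drive the same end-to-end SQL flow against the extension. A rough sketch of that flow, with statements and names lifted from the tests themselves (the lists and probes values are simply the ones the tests happen to use, not recommendations):

    CREATE EXTENSION vector;
    CREATE TABLE tst (i int4, v vector(3));
    INSERT INTO tst SELECT i, ARRAY[random(), random(), random()] FROM generate_series(1, 100000) i;
    -- one opclass per distance operator: vector_l2_ops (<->), vector_ip_ops (<#>), vector_cosine_ops (<=>)
    CREATE INDEX ON tst USING ivfflat (v vector_l2_ops) WITH (lists = 100);
    SET ivfflat.probes = 10;  -- more probes: higher recall, slower scans (see 003_recall.pl)
    SELECT i FROM tst ORDER BY v <-> '[0.5,0.5,0.5]' LIMIT 20;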