0 Star 0 Fork 18

wangjie/IvorySQL

forked from IvorySQL/IvorySQL 
加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
pg_popcount_avx512.c 3.99 KB
一键复制 编辑 原始数据 按行查看 历史
Tom Lane 提交于 2024-05-15 04:34 +08:00 . Pre-beta mechanical code beautification.
/*-------------------------------------------------------------------------
*
* pg_popcount_avx512.c
* Holds the AVX-512 pg_popcount() implementation.
*
* Copyright (c) 2024, PostgreSQL Global Development Group
*
* IDENTIFICATION
* src/port/pg_popcount_avx512.c
*
*-------------------------------------------------------------------------
*/
#include "c.h"
#include <immintrin.h>
#include "port/pg_bitutils.h"
/*
* It's probably unlikely that TRY_POPCNT_FAST won't be set if we are able to
* use AVX-512 intrinsics, but we check it anyway to be sure. We piggy-back on
* the function pointers that are only used when TRY_POPCNT_FAST is set.
*/
#ifdef TRY_POPCNT_FAST
/*
 * pg_popcount_avx512
 *		Returns the number of 1-bits in buf
 *
 * buf points at the first byte to examine and bytes is the number of bytes
 * to examine; buf does not need to be aligned.  The data is processed in
 * 64-byte (sizeof(__m512i)) chunks starting at buf rounded down to a 64-byte
 * boundary.  Byte masks applied to the first and last chunk zero out every
 * byte that lies outside [buf, buf + bytes), so those bytes never contribute
 * to the result.
 *
 * A length of zero or less returns 0 without reading any memory.
 */
uint64
pg_popcount_avx512(const char *buf, int bytes)
{
	__m512i		val,
				cnt;
	__m512i		accum = _mm512_setzero_si512();
	const char *final;
	int			tail_idx;
	__mmask64	mask = ~UINT64CONST(0);

	/*
	 * The address arithmetic below is based on the last byte, buf + bytes -
	 * 1, and therefore needs at least one byte.  For bytes == 0 with an
	 * aligned buf it would otherwise count a full 64-byte chunk that is not
	 * part of the input.
	 */
	if (bytes <= 0)
		return 0;

	/*
	 * Align buffer down to avoid double load overhead from unaligned access.
	 * Calculate a mask to ignore preceding bytes.  Find start offset of final
	 * iteration and ensure it is not empty.
	 */
	mask <<= ((uintptr_t) buf) % sizeof(__m512i);
	/* tail_idx is the number of valid bytes (1..64) in the final chunk */
	tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
	/* final is the aligned address of the chunk holding the last byte */
	final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
	buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);

	/*
	 * Iterate through all but the final iteration.  Starting from the second
	 * iteration, the mask is ignored.
	 */
	if (buf < final)
	{
		/* First chunk: zero-masked load drops the bytes preceding buf */
		val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
		cnt = _mm512_popcnt_epi64(val);
		accum = _mm512_add_epi64(accum, cnt);

		buf += sizeof(__m512i);
		/* Leading bytes are dealt with; later chunks start fully valid */
		mask = ~UINT64CONST(0);

		/* Middle chunks are whole and aligned, so use plain aligned loads */
		for (; buf < final; buf += sizeof(__m512i))
		{
			val = _mm512_load_si512((const __m512i *) buf);
			cnt = _mm512_popcnt_epi64(val);
			accum = _mm512_add_epi64(accum, cnt);
		}
	}

	/*
	 * Final iteration needs to ignore bytes that are not within the length.
	 * If the whole input fits in one chunk, mask still carries the
	 * leading-byte exclusion computed above, so both ends are trimmed here.
	 */
	mask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));

	val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
	cnt = _mm512_popcnt_epi64(val);
	accum = _mm512_add_epi64(accum, cnt);

	/* Sum the eight 64-bit lane counters into the final result */
	return _mm512_reduce_add_epi64(accum);
}
/*
 * pg_popcount_masked_avx512
 *		Returns the number of 1-bits in buf after applying the mask to each byte
 *
 * buf points at the first byte to examine and bytes is the number of bytes
 * to examine; buf does not need to be aligned.  Every input byte is ANDed
 * with mask before its bits are counted.  The data is processed in 64-byte
 * (sizeof(__m512i)) chunks starting at buf rounded down to a 64-byte
 * boundary.  Byte masks applied to the first and last chunk zero out every
 * byte that lies outside [buf, buf + bytes), so those bytes never contribute
 * to the result.
 *
 * A length of zero or less returns 0 without reading any memory.
 */
uint64
pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)
{
	__m512i		val,
				vmasked,
				cnt;
	__m512i		accum = _mm512_setzero_si512();
	const char *final;
	int			tail_idx;
	__mmask64	bmask = ~UINT64CONST(0);
	/* The caller's byte mask replicated into all 64 byte lanes */
	const __m512i maskv = _mm512_set1_epi8(mask);

	/*
	 * The address arithmetic below is based on the last byte, buf + bytes -
	 * 1, and therefore needs at least one byte.  For bytes == 0 with an
	 * aligned buf it would otherwise count a full 64-byte chunk that is not
	 * part of the input.
	 */
	if (bytes <= 0)
		return 0;

	/*
	 * Align buffer down to avoid double load overhead from unaligned access.
	 * Calculate a mask to ignore preceding bytes.  Find start offset of final
	 * iteration and ensure it is not empty.
	 */
	bmask <<= ((uintptr_t) buf) % sizeof(__m512i);
	/* tail_idx is the number of valid bytes (1..64) in the final chunk */
	tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
	/* final is the aligned address of the chunk holding the last byte */
	final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
	buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);

	/*
	 * Iterate through all but the final iteration.  Starting from the second
	 * iteration, the mask is ignored.
	 */
	if (buf < final)
	{
		/* First chunk: zero-masked load drops the bytes preceding buf */
		val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
		vmasked = _mm512_and_si512(val, maskv);
		cnt = _mm512_popcnt_epi64(vmasked);
		accum = _mm512_add_epi64(accum, cnt);

		buf += sizeof(__m512i);
		/* Leading bytes are dealt with; later chunks start fully valid */
		bmask = ~UINT64CONST(0);

		/* Middle chunks are whole and aligned, so use plain aligned loads */
		for (; buf < final; buf += sizeof(__m512i))
		{
			val = _mm512_load_si512((const __m512i *) buf);
			vmasked = _mm512_and_si512(val, maskv);
			cnt = _mm512_popcnt_epi64(vmasked);
			accum = _mm512_add_epi64(accum, cnt);
		}
	}

	/*
	 * Final iteration needs to ignore bytes that are not within the length.
	 * If the whole input fits in one chunk, bmask still carries the
	 * leading-byte exclusion computed above, so both ends are trimmed here.
	 */
	bmask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));

	val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
	vmasked = _mm512_and_si512(val, maskv);
	cnt = _mm512_popcnt_epi64(vmasked);
	accum = _mm512_add_epi64(accum, cnt);

	/* Sum the eight 64-bit lane counters into the final result */
	return _mm512_reduce_add_epi64(accum);
}
#endif /* TRY_POPCNT_FAST */
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
C
1
https://gitee.com/Jugier/IvorySQL.git
git@gitee.com:Jugier/IvorySQL.git
Jugier
IvorySQL
IvorySQL
master

搜索帮助