Want to improve this question? Update the question so it focuses on one problem only by editing this post.
Closed 3 hours ago.
I am trying to use Vector<T>
from System.Numerics in C#.
I have been given a C++ approach to the problem I am trying to solve but I am not very good at C++ and have problem to understand the logic and wonder if anyone have the knowledge of How to convert this code to C#.
My original post that describes this problem and from where I got this code is: Micro Optimization of a 4-bucket histogram of a large array or list
Perhaps without going into to much details of what the code does. Maybe the code will be self explanatory where I try to use Vector
to histogram values from a List<int>/std::vector<int>
as fast as possible where in this example simultaneously processes 8 indexes at the same time using SIMD.
#include <immintrin.h>
#include <vector>
static inline
__m128i hsum_xpose(const __m256i counts[4]) // UNTESTED and not 100% sure this is right
{
// AMD Zen1 might benefit from reducing inputs to 128 first
// but on Zen2 and Intel 256-bit in-lane shuffles are no cheaper than 128
__m256i sum01 = _mm256_hadd_epi32(counts[0], counts[1]);
__m256i sum23 = _mm256_hadd_epi32(counts[2], counts[3]);
__m256i sum0123 = _mm256_hadd_epi32(sum01, sum23); // in-lane hsums
__m128i sum_high = _mm256_extracti128_si256(sum0123, 1); // add high and low lanes
// add upper 128 bits of sum to its lower 128 bits
__m128i result = _mm_add_epi32(sum_high, _mm256_castsi256_si128(sum0123));
return result;
}
void count_elements_avx2(const std::vector<int> &input, unsigned output_counts[4])
{
__m256i counts[4] = { _mm256_setzero_si256() }; // 4 vectors of zeroed counters
// each vector holds counts for one bucket, to be hsummed at the end
size_t size = input.size();
for(size_t i = 0 ; i<size ; i+=8) { // 8x 32-bit elements per vector
__m256i v = _mm256_loadu_si256((const __m256i*)&input[i]); // unaligned load of 8 ints
for (int val = 0 ; val < 3; val++) {
// C++ compilers will unroll this with 3 vector constants and no memory access
__m256i match = _mm256_cmpeq_epi32(v, _mm256_set1_epi32(val)); // 0 or all-ones aka -1
counts[val] = _mm256_sub_epi32(counts[val], match); // x -= -1 or 0 conditional increment
}
}
// transpose and sum 4 vectors of 8 elements down to 1 vector of 4 elements
__m128i summed_counts = hsum_xpose(counts); // helper function defined in Godbolt link
_mm_storeu_si128((__m128i*)output_counts, summed_counts);
output_counts[3] = size - output_counts[0]
- output_counts[1] - output_counts[2];
// TODO: handle the last size%8 input elements; scalar would be easy
}
Please login or Register to submit your answer