AVX512 modifications for LDPC encoding: interleaving, ZC384 BG1, output...
AVX512 modifications for LDPC encoding: interleaving, ZC384 BG1, output formatting for encoder, segmentation (memcpy instead of loop). Some improvements in TX for aarch64 in the same places where AVX512 support was added. Also, rate matching and interleaving are done on bytes containing 8 segments, and reformatting of the output is done at the end of segment processing instead of after LDPC encoding.
This improves the overall performance of the NR DL transmitter in the gNB. Here is a summary of times on some machines at EURECOM (note: rk_integration1224 is not an MR; it is this MR combined with MR !3127). Machines: matix = 5.9 GHz Ryzen Gen4, peafowl = 4.1 GHz EPYC 9374F, stupix = 3.6 GHz Xeon Gold 6354, broadbill = 3.0 GHz EPYC 8534P, falcon-gh200 = 3.6 GHz Nvidia GH200.
sudo ./nr_dlsim -n100 -P -x2 -y4 -z4 -R273 -b273 -e 25 -s30
ldpc_enc_avx512 (matix) 280 us
develop (matix) 394.56 us
ldpc_enc_avx512 (peafowl) 431 us
ldpc_enc_avx512 (peafowl with T2) 311 us
develop (peafowl) 554.98 us
ldpc_enc_avx512 (falcon-gh200) 524 us
develop (falcon-gh200) 809.76 us
rk_integration1224 (peafowl) 375 us
rk_integration1224 (matix) 258 us
rk_integration1224 (stupix) 554 us
rk_integration1224 (stupix, --noavx512) 546 us
rk_integration1224 (broadbill) 544 us
rk_integration1224 (broadbill, --noavx512) 508 us
rk_integration1224 (falcon-gh200) 417 us
sudo ./nr_dlsim -n100 -P -x2 -y4 -z4 -R273 -b273 -e 25 -s30 -X 8,9,10,11,12
ldpc_enc_avx512 (peafowl) 360 us
ldpc_enc_avx512 (peafowl with T2) 311 us
develop (peafowl) 407.75 us
ldpc_enc_avx512 (matix) 248 us (-1,-1,-1,-1,-1)
develop (matix) 383.98 us (-1,-1,-1,-1,-1)
ldpc_enc_avx512 (falcon-gh200) 388 us (4,5,6,7,8)
develop (falcon-gh200) 442 us (4,5,6,7,8)
rk_integration1224 (peafowl) 327 us
rk_integration1224 (matix) 204 us
rk_integration1224 (stupix) 496 us
rk_integration1224 (stupix, --noavx512) 485 us
rk_integration1224 (broadbill) 418 us
rk_integration1224 (broadbill, --noavx512) 407 us
rk_integration1224 (falcon-gh200) 275 us (4,5,6,7,8)
sudo ./nr_dlsim -n100 -P -x2 -y4 -z4 -R273 -b273 -e 25 -s30 -q1
ldpc_enc_avx512 (matix) 348 us
develop (matix) 533.98 us
ldpc_enc_avx512 (peafowl) 550 us
ldpc_enc_avx512 (peafowl with T2) 351 us
develop (peafowl) 735.42 us
ldpc_enc_avx512 (falcon-gh200) 646 us
develop (falcon-gh200) 1089 us
rk_integration1224 (peafowl) 480 us
rk_integration1224 (matix) 393 us
rk_integration1224 (stupix) 660 us
rk_integration1224 (stupix, --noavx512) 657 us
rk_integration1224 (broadbill) 661 us
rk_integration1224 (broadbill, --noavx512) 627 us
rk_integration1224 (falcon-gh200) 536 us
sudo ./nr_dlsim -n100 -P -x2 -y4 -z4 -R273 -b273 -e 25 -s30 -q1 -X 8,9,10,11,12
ldpc_enc_avx512 (matix) 296 us (-1,-1,-1,-1,-1)
develop (matix) 481.90 us (-1,-1,-1,-1,-1)
ldpc_enc_avx512 (peafowl) 392 us
ldpc_enc_avx512 (peafowl with T2) 351 us
develop (peafowl) 466.49 us
ldpc_enc_avx512 (falcon-gh200) 420 us (4,5,6,7,8)
develop (falcon-gh200) 573.81 us (4,5,6,7,8)
rk_integration1224 (peafowl) 329 us
rk_integration1224 (matix) 256 us
rk_integration1224 (stupix) 653 us
rk_integration1224 (stupix, --noavx512) 547 us
rk_integration1224 (broadbill) 408 us
rk_integration1224 (broadbill, --noavx512) 446 us
rk_integration1224 (falcon-gh200) 307 us (4,5,6,7,8)