diff --git a/config/m4sme_e/bli_cntx_init_m4sme_e.c b/config/m4sme_e/bli_cntx_init_m4sme_e.c new file mode 100644 index 000000000..251969eca --- /dev/null +++ b/config/m4sme_e/bli_cntx_init_m4sme_e.c @@ -0,0 +1,199 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +void bli_cntx_init_m4sme_e( cntx_t* cntx ) +{ + blksz_t blkszs[ BLIS_NUM_BLKSZS ]; + + // Set default kernel blocksizes and functions. + bli_cntx_init_m4sme_e_ref( cntx ); + + // ------------------------------------------------------------------------- +#if 0 + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsme_int_2SVLx2SVL, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsme_int_4SVLx2SVL, + BLIS_PACKM_KER, BLIS_FLOAT, bli_spackm_armsme_int_2SVLx2SVL, + BLIS_PACKM_KER, BLIS_DOUBLE, bli_dpackm_armsme_int_4SVLx2SVL, + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + BLIS_VA_END + ); + + // Initialize level-3 blocksize objects with architecture-specific values. + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 32, 32, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 32, 16, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 192, 608, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 2112, 1120, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 10240, 3072, -1, -1 ); +#endif + +#if 1 +// Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsme_int_SVLx4SVL, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsme_int_SVLx8SVL, + BLIS_PACKM_KER, BLIS_FLOAT, bli_spackm_armsme_int_SVLx4SVL, + BLIS_PACKM_KER, BLIS_DOUBLE, bli_dpackm_armsme_int_SVLx8SVL, + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + BLIS_VA_END + ); + + // Initialize level-3 blocksize objects with architecture-specific values. 
+ // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 16, 8, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 64, 64, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 768, 512, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 1472, 1024, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 10240, 3072, -1, -1 ); +#endif + +#if 0 + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsme_int_4SVLxSVL, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsme_int_8SVLxSVL, + BLIS_PACKM_KER, BLIS_FLOAT, bli_spackm_armsme_int_SVLx4SVL, + BLIS_PACKM_KER, BLIS_DOUBLE, bli_dpackm_armsme_int_SVLx8SVL, + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + BLIS_VA_END + ); + + // Initialize level-3 blocksize objects with architecture-specific values. + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 64, 64, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 640, 512, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 1024, 1024, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 10240, 3072, -1, -1 ); +#endif +#if 0 + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsme_int_2SVLx2SVL, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsme_int_2SVLx4SVL, + BLIS_PACKM_KER, BLIS_FLOAT, bli_spackm_armsme_int_2SVLx2SVL, + BLIS_PACKM_KER, BLIS_DOUBLE, bli_dpackm_armsme_int_4SVLx2SVL, + BLIS_VA_END + ); + + // Update the context with storage preferences. 
+ bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + BLIS_VA_END + ); + + // Initialize level-3 blocksize objects with architecture-specific values. + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 32, 16, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 32, 32, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 192, 608, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 2112, 1120, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 10240, 3072, -1, -1 ); +#endif + + // Update the context with the current architecture's register and cache + // blocksizes (and multiples) for native execution. + bli_cntx_set_blkszs + ( + cntx, + + // level-3 + BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, + BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, + BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, + BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + + BLIS_VA_END + ); +} diff --git a/config/m4sme_e/bli_family_m4sme_e.h b/config/m4sme_e/bli_family_m4sme_e.h new file mode 100644 index 000000000..5ea9dbc1d --- /dev/null +++ b/config/m4sme_e/bli_family_m4sme_e.h @@ -0,0 +1,46 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_FAMILY_H +//#define BLIS_FAMILY_H + + +// -- MEMORY ALLOCATION -------------------------------------------------------- + +#define BLIS_SIMD_ALIGN_SIZE 64 + +#define BLIS_SIMD_MAX_SIZE 256 +#define BLIS_SIMD_MAX_NUM_REGISTERS 32 + + diff --git a/config/m4sme_e/bli_kernel_defs_m4sme_e.h b/config/m4sme_e/bli_kernel_defs_m4sme_e.h new file mode 100644 index 000000000..945ed0b6d --- /dev/null +++ b/config/m4sme_e/bli_kernel_defs_m4sme_e.h @@ -0,0 +1,70 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- +#if 0 +#define BLIS_MR_s 32 +#define BLIS_MR_d 32 + +#define BLIS_NR_s 32 +#define BLIS_NR_d 16 +#endif + +#if 1 +#define BLIS_MR_s 16 +#define BLIS_MR_d 8 + +#define BLIS_NR_s 64 +#define BLIS_NR_d 64 +#endif + +#if 0 +#define BLIS_MR_s 64 +#define BLIS_MR_d 64 + +#define BLIS_NR_s 16 +#define BLIS_NR_d 8 +#endif + +#if 0 +#define BLIS_MR_s 32 +#define BLIS_MR_d 16 + +#define BLIS_NR_s 32 +#define BLIS_NR_d 32 +#endif diff --git a/config/m4sme_e/make_defs.mk b/config/m4sme_e/make_defs.mk new file mode 100644 index 000000000..a7966a62b --- /dev/null +++ b/config/m4sme_e/make_defs.mk @@ -0,0 +1,92 @@ +# +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2014, The University of Texas at Austin +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name(s) of the copyright holder(s) nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + + +# Declare the name of the current configuration and add it to the +# running list of configurations included by common.mk. +THIS_CONFIG := m4sme_e +#CONFIGS_INCL += $(THIS_CONFIG) + +# +# --- Determine the C compiler and related flags --- +# + +# NOTE: The build system will append these variables with various +# general-purpose/configuration-agnostic flags in common.mk. You +# may specify additional flags here as needed. +CPPROCFLAGS := -D_GNU_SOURCE +CMISCFLAGS := -O3 -std=c99 -march=native+sme2+sme-f64f64 -fno-exceptions -fno-rtti -mno-unaligned-access -I /opt/homebrew/opt/libomp/include +CPICFLAGS := -fPIC +CWARNFLAGS := + +ifneq ($(DEBUG_TYPE),off) +CDBGFLAGS := -g +endif + +ifeq ($(DEBUG_TYPE),noopt) +COPTFLAGS := -O0 +else +COPTFLAGS := -O3 -std=c99 -march=native+sme2+sme-f64f64 -fno-exceptions -fno-rtti -mno-unaligned-access + +endif + +# Flags specific to optimized kernels. +ifeq ($(CC_VENDOR),gcc) +CKVECFLAGS := -march=native+sme2+sme-f64f64 -fno-exceptions -fno-rtti -mno-unaligned-access -fno-builtin +else +ifeq ($(CC_VENDOR),clang) +CKVECFLAGS := -O3 -march=native+sme2+sme-f64f64 -fno-exceptions -fno-rtti -mno-unaligned-access -fno-builtin +else +$(error gcc or clang is required for this configuration.) +endif +endif + +# Flags specific to reference kernels. 
+CROPTFLAGS := $(COPTFLAGS) +ifeq ($(CC_VENDOR),gcc) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else +ifeq ($(CC_VENDOR),clang) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else +CRVECFLAGS := $(CKVECFLAGS) +endif +endif + +# Store all of the variables here to new variables containing the +# configuration name. +$(eval $(call store-make-defs,$(THIS_CONFIG))) + + + diff --git a/config/m4sme_p/bli_cntx_init_m4sme_p.c b/config/m4sme_p/bli_cntx_init_m4sme_p.c new file mode 100644 index 000000000..8db939c37 --- /dev/null +++ b/config/m4sme_p/bli_cntx_init_m4sme_p.c @@ -0,0 +1,199 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_cntx_init_m4sme_p( cntx_t* cntx ) +{ + blksz_t blkszs[ BLIS_NUM_BLKSZS ]; + + // Set default kernel blocksizes and functions. + bli_cntx_init_m4sme_p_ref( cntx ); + + // ------------------------------------------------------------------------- +#if 1 + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsme_int_2SVLx2SVL, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsme_int_4SVLx2SVL, + BLIS_PACKM_KER, BLIS_FLOAT, bli_spackm_armsme_int_2SVLx2SVL, + BLIS_PACKM_KER, BLIS_DOUBLE, bli_dpackm_armsme_int_4SVLx2SVL, + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + BLIS_VA_END + ); + + // Initialize level-3 blocksize objects with architecture-specific values. + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 32, 32, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 32, 16, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 768, 608, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 3648, 2240, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 10240, 3072, -1, -1 ); +#endif + +#if 0 +// Update the context with optimized native gemm micro-kernels. 
+ bli_cntx_set_ukrs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsme_int_SVLx4SVL, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsme_int_SVLx8SVL, + BLIS_PACKM_KER, BLIS_FLOAT, bli_spackm_armsme_int_SVLx4SVL, + BLIS_PACKM_KER, BLIS_DOUBLE, bli_dpackm_armsme_int_SVLx8SVL, + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + BLIS_VA_END + ); + + // Initialize level-3 blocksize objects with architecture-specific values. + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 16, 8, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 64, 64, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 512, 512, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 4032, 2016, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 10240, 3072, -1, -1 ); +#endif + +#if 0 + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsme_int_4SVLxSVL, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsme_int_8SVLxSVL, + BLIS_PACKM_KER, BLIS_FLOAT, bli_spackm_armsme_int_SVLx4SVL, + BLIS_PACKM_KER, BLIS_DOUBLE, bli_dpackm_armsme_int_SVLx8SVL, + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + BLIS_VA_END + ); + + // Initialize level-3 blocksize objects with architecture-specific values. 
+ // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 64, 64, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 512, 512, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 4032, 2016, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 10240, 3072, -1, -1 ); +#endif +#if 0 + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsme_int_2SVLx2SVL, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsme_int_2SVLx4SVL, + BLIS_PACKM_KER, BLIS_FLOAT, bli_spackm_armsme_int_2SVLx2SVL, + BLIS_PACKM_KER, BLIS_DOUBLE, bli_dpackm_armsme_int_4SVLx2SVL, + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + BLIS_VA_END + ); + + // Initialize level-3 blocksize objects with architecture-specific values. + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 32, 16, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 32, 32, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 768, 608, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 3648, 2240, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 10240, 3072, -1, -1 ); +#endif + + // Update the context with the current architecture's register and cache + // blocksizes (and multiples) for native execution. 
+ bli_cntx_set_blkszs + ( + cntx, + + // level-3 + BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, + BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, + BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, + BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + + BLIS_VA_END + ); +} diff --git a/config/m4sme_p/bli_family_m4sme_p.h b/config/m4sme_p/bli_family_m4sme_p.h new file mode 100644 index 000000000..5ea9dbc1d --- /dev/null +++ b/config/m4sme_p/bli_family_m4sme_p.h @@ -0,0 +1,46 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_FAMILY_H +//#define BLIS_FAMILY_H + + +// -- MEMORY ALLOCATION -------------------------------------------------------- + +#define BLIS_SIMD_ALIGN_SIZE 64 + +#define BLIS_SIMD_MAX_SIZE 256 +#define BLIS_SIMD_MAX_NUM_REGISTERS 32 + + diff --git a/config/m4sme_p/bli_kernel_defs_m4sme_p.h b/config/m4sme_p/bli_kernel_defs_m4sme_p.h new file mode 100644 index 000000000..8e6d7128c --- /dev/null +++ b/config/m4sme_p/bli_kernel_defs_m4sme_p.h @@ -0,0 +1,70 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- +#if 1 +#define BLIS_MR_s 32 +#define BLIS_MR_d 32 + +#define BLIS_NR_s 32 +#define BLIS_NR_d 16 +#endif + +#if 0 +#define BLIS_MR_s 16 +#define BLIS_MR_d 8 + +#define BLIS_NR_s 64 +#define BLIS_NR_d 64 +#endif + +#if 0 +#define BLIS_MR_s 64 +#define BLIS_MR_d 64 + +#define BLIS_NR_s 16 +#define BLIS_NR_d 8 +#endif + +#if 0 +#define BLIS_MR_s 32 +#define BLIS_MR_d 16 + +#define BLIS_NR_s 32 +#define BLIS_NR_d 32 +#endif diff --git a/config/m4sme_p/make_defs.mk b/config/m4sme_p/make_defs.mk new file mode 100644 index 000000000..4d3daeb43 --- /dev/null +++ b/config/m4sme_p/make_defs.mk @@ -0,0 +1,92 @@ +# +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2014, The University of Texas at Austin +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name(s) of the copyright holder(s) nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + + +# Declare the name of the current configuration and add it to the +# running list of configurations included by common.mk. +THIS_CONFIG := m4sme_p +#CONFIGS_INCL += $(THIS_CONFIG) + +# +# --- Determine the C compiler and related flags --- +# + +# NOTE: The build system will append these variables with various +# general-purpose/configuration-agnostic flags in common.mk. You +# may specify additional flags here as needed. 
+CPPROCFLAGS := -D_GNU_SOURCE +CMISCFLAGS := -O3 -std=c99 -march=native+sme2+sme-f64f64 -fno-exceptions -fno-rtti -mno-unaligned-access -I /opt/homebrew/opt/libomp/include +CPICFLAGS := -fPIC +CWARNFLAGS := + +ifneq ($(DEBUG_TYPE),off) +CDBGFLAGS := -g +endif + +ifeq ($(DEBUG_TYPE),noopt) +COPTFLAGS := -O0 +else +COPTFLAGS := -O3 -std=c99 -march=native+sme2+sme-f64f64 -fno-exceptions -fno-rtti -mno-unaligned-access + +endif + +# Flags specific to optimized kernels. +ifeq ($(CC_VENDOR),gcc) +CKVECFLAGS := -march=native+sme2+sme-f64f64 -fno-exceptions -fno-rtti -mno-unaligned-access -fno-builtin +else +ifeq ($(CC_VENDOR),clang) +CKVECFLAGS := -O3 -march=native+sme2+sme-f64f64 -fno-exceptions -fno-rtti -mno-unaligned-access -fno-builtin +else +$(error gcc or clang is required for this configuration.) +endif +endif + +# Flags specific to reference kernels. +CROPTFLAGS := $(COPTFLAGS) +ifeq ($(CC_VENDOR),gcc) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else +ifeq ($(CC_VENDOR),clang) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else +CRVECFLAGS := $(CKVECFLAGS) +endif +endif + +# Store all of the variables here to new variables containing the +# configuration name. +$(eval $(call store-make-defs,$(THIS_CONFIG))) + + + 
armsve: armsve/armsve a64fx: a64fx/armsve +m4sme_p: m4sme_p/armsme +m4sme_e: m4sme_e/armsme # ARM Neon64 (4 pipes x 128b) architectures. altramax: altramax/armv8a diff --git a/configure b/configure index de7b80041..18c546be7 100755 --- a/configure +++ b/configure @@ -1876,6 +1876,12 @@ check_compiler() fi fi + # sme2 clang support + if [[ ! ( "$(uname -s)" == "Darwin" && ${cc_vendor} = clang && ${cc_major} -ge 17 ) ]]; then + blacklistcc_add "m4sme_p" + blacklistcc_add "m4sme_e" + fi + # clang if [[ ${cc_vendor} = clang ]]; then if [[ ${vendor_string} = *Apple* ]]; then diff --git a/frame/base/bli_arch.c b/frame/base/bli_arch.c index 776bb698b..435c4cca3 100644 --- a/frame/base/bli_arch.c +++ b/frame/base/bli_arch.c @@ -252,6 +252,13 @@ arch_t bli_arch_query_id_impl( void ) #endif // ARM microarchitectures. + #ifdef BLIS_FAMILY_M4SME_P + id = BLIS_ARCH_M4SME_P; + #endif + #ifdef BLIS_FAMILY_M4SME_E + id = BLIS_ARCH_M4SME_E; + #endif + #ifdef BLIS_FAMILY_ARMSVE id = BLIS_ARCH_ARMSVE; #endif @@ -363,6 +370,9 @@ static const char* config_name[ BLIS_NUM_ARCHS ] = "piledriver", "bulldozer", + "m4sme_p", + "m4sme_e", + "armsve", "a64fx", diff --git a/frame/base/bli_cpuid.c b/frame/base/bli_cpuid.c index 08555b194..b01f75209 100644 --- a/frame/base/bli_cpuid.c +++ b/frame/base/bli_cpuid.c @@ -1024,8 +1024,9 @@ et al #endif //__linux__ #ifdef __APPLE__ +#define _DARWIN_C_SOURCE #include -// #include +#include #endif static uint32_t get_coretype @@ -1069,6 +1070,13 @@ static uint32_t get_coretype // FIXME: compute actual part number implementer = 0x61; //Apple part = 0x023; //Firestorm + int sme2_supported = 0; + size_t len = sizeof(sme2_supported); + if (sysctlbyname("hw.optional.arm.FEAT_SME2", &sme2_supported, &len, NULL, 0) == 0) { + if (sme2_supported) { + *features |= FEATURE_SME2; + } + } #endif //__APPLE__ // From Linux arch/arm64/include/asm/cputype.h @@ -1139,6 +1147,10 @@ static uint32_t get_coretype #define APPLE_CPU_PART_BLIZZARD_MAX 0x038 #define 
APPLE_CPU_PART_AVALANCHE_MAX 0x039 +#ifdef BLIS_CONFIG_M4SME_P + if (*features & FEATURE_SME2) + return BLIS_ARCH_M4SME_P; +#endif // Fixme: After merging the vpu_count branch we could report the // part here with bli_dolog. diff --git a/frame/base/bli_cpuid.h b/frame/base/bli_cpuid.h index 77bb62ced..5bea8cfae 100644 --- a/frame/base/bli_cpuid.h +++ b/frame/base/bli_cpuid.h @@ -70,6 +70,7 @@ bool bli_cpuid_is_piledriver( uint32_t family, uint32_t model, uint32_t features bool bli_cpuid_is_bulldozer( uint32_t family, uint32_t model, uint32_t features ); // ARM +bool bli_cpuid_is_m4sme_p( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_thunderx2( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_cortexa57( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_cortexa53( uint32_t model, uint32_t part, uint32_t features ); @@ -183,7 +184,8 @@ enum enum { FEATURE_NEON = 0x01, - FEATURE_SVE = 0x02 + FEATURE_SVE = 0x02, + FEATURE_SME2 = 0x04 }; #endif diff --git a/frame/include/bli_arch_config.h b/frame/include/bli_arch_config.h index 49a894302..99d1a8264 100644 --- a/frame/include/bli_arch_config.h +++ b/frame/include/bli_arch_config.h @@ -120,6 +120,14 @@ INSERT_GENTCONF // -- ARM architectures -- +// ARM-SME +#ifdef BLIS_FAMILY_M4SME_P +#include "bli_family_m4sme_p.h" +#endif +#ifdef BLIS_FAMILY_M4SME_E +#include "bli_family_m4sme_e.h" +#endif + // ARM-SVE #ifdef BLIS_FAMILY_ARMSVE #include "bli_family_armsve.h" @@ -241,6 +249,9 @@ INSERT_GENTCONF // -- ARM architectures -- +#ifdef BLIS_KERNELS_ARMSME +#include "bli_kernels_armsme.h" +#endif #ifdef BLIS_KERNELS_ARMSVE #include "bli_kernels_armsve.h" #endif diff --git a/frame/include/bli_gentconf_macro_defs.h b/frame/include/bli_gentconf_macro_defs.h index f6f3af20e..e75c1cac0 100644 --- a/frame/include/bli_gentconf_macro_defs.h +++ b/frame/include/bli_gentconf_macro_defs.h @@ -43,6 +43,19 @@ // -- configuration-specific macros which are conditionally-enabled -- 
+// -- Apple M4 architecture ---------------------------------------------------- + +#ifdef BLIS_CONFIG_M4SME_P +#define INSERT_GENTCONF_M4SME_P GENTCONF( M4SME_P, m4sme_p ) +#else +#define INSERT_GENTCONF_M4SME_P +#endif +#ifdef BLIS_CONFIG_M4SME_E +#define INSERT_GENTCONF_M4SME_E GENTCONF( M4SME_E, m4sme_e ) +#else +#define INSERT_GENTCONF_M4SME_E +#endif + // -- Intel architectures ------------------------------------------------------ #ifdef BLIS_CONFIG_SKX @@ -246,6 +259,9 @@ #define INSERT_GENTCONF \ \ +INSERT_GENTCONF_M4SME_P \ +INSERT_GENTCONF_M4SME_E \ +\ INSERT_GENTCONF_SKX \ INSERT_GENTCONF_KNL \ INSERT_GENTCONF_KNC \ diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 758f9eb30..f2b6b48c0 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -976,6 +976,10 @@ typedef enum arch_e BLIS_ARCH_PILEDRIVER, BLIS_ARCH_BULLDOZER, + // ARM-SME + BLIS_ARCH_M4SME_P, + BLIS_ARCH_M4SME_E, + // ARM-SVE BLIS_ARCH_ARMSVE, BLIS_ARCH_A64FX, diff --git a/kernels/armsme/1m/bli_packm_armsme_int_d4SVLx2SVL.c b/kernels/armsme/1m/bli_packm_armsme_int_d4SVLx2SVL.c new file mode 100644 index 000000000..872f32775 --- /dev/null +++ b/kernels/armsme/1m/bli_packm_armsme_int_d4SVLx2SVL.c @@ -0,0 +1,1180 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, Linaro Limited + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include +#include + +#include "blis.h" + +// MACROS FOR FALLTHROUGH LOGIC + +// PATH 1 + +// 1. Core Read, Shuffle & Store Logic for Pairs +// Reads 4 columns from Top(TA) and Bottom(TB) tiles +// Interleaves: [TB_0, TA_0, TB_1, TA_1] ... 
// Reads 4 columns each from a top tile (TA) and a bottom tile (TB) of ZA,
// interleaves them as bottom/top pairs — [TB_0, TA_0, TB_1, TA_1] then
// [TB_2, TA_2, TB_3, TA_3] — and stores 8 vectors to P_PTR, advancing it.
#define OP_VG4_PAIR( TA, TB, TCOL, P_PTR ) \
  { \
    svcount_t p_all = svptrue_c64(); \
    svfloat64x4_t zA = svread_ver_za64_f64_vg4( TA, TCOL ); \
    svfloat64x4_t zB = svread_ver_za64_f64_vg4( TB, TCOL ); \
    \
    /* Shuffle Cols 0 & 1: [B0, A0, B1, A1] */ \
    svfloat64x4_t z0 = svcreate4( svget4( zB, 0 ), svget4( zA, 0 ), \
                                  svget4( zB, 1 ), svget4( zA, 1 ) ); \
    /* Shuffle Cols 2 & 3: [B2, A2, B3, A3] */ \
    svfloat64x4_t z1 = svcreate4( svget4( zB, 2 ), svget4( zA, 2 ), \
                                  svget4( zB, 3 ), svget4( zA, 3 ) ); \
    \
    svst1( p_all, P_PTR, z0 ); \
    svst1( p_all, P_PTR + 4 * SVL, z1 ); \
    P_PTR += ( 8 * SVL ); \
  }

// Same as OP_VG4_PAIR but for 2 columns: stores [B0, A0, B1, A1] (4 vectors).
#define OP_VG2_PAIR( TA, TB, TCOL, P_PTR ) \
  { \
    svcount_t p_all = svptrue_c64(); \
    svfloat64x2_t zA = svread_ver_za64_f64_vg2( TA, TCOL ); \
    svfloat64x2_t zB = svread_ver_za64_f64_vg2( TB, TCOL ); \
    \
    /* Shuffle Cols 0 & 1 */ \
    svfloat64x4_t z0 = svcreate4( svget2( zB, 0 ), svget2( zA, 0 ), \
                                  svget2( zB, 1 ), svget2( zA, 1 ) ); \
    \
    svst1( p_all, P_PTR, z0 ); \
    P_PTR += ( 4 * SVL ); \
  }

// Single-column variant: stores the pair [B, A] (2 vectors).
#define OP_VG1_PAIR( TA, TB, TCOL, P_PTR ) \
  { \
    svbool_t p_true = svptrue_b64(); \
    svcount_t p_cnt = svptrue_c64(); \
    \
    /* Merge into an explicitly-undefined vector. The predicate is */ \
    /* all-true, so every lane is overwritten by the tile column. */ \
    /* (The previous code passed vA/vB as their own merge source in */ \
    /* their own initializers — reading an uninitialized variable, */ \
    /* which is undefined behavior in C.) */ \
    svfloat64_t vA = svread_ver_za64_m( svundef_f64(), p_true, TA, TCOL ); \
    svfloat64_t vB = svread_ver_za64_m( svundef_f64(), p_true, TB, TCOL ); \
    \
    /* Store as pair [B, A] (2 vectors total) */ \
    svfloat64x2_t z0 = svcreate2( vB, vA ); \
    svst1( p_cnt, P_PTR, z0 ); \
    P_PTR += ( 2 * SVL ); \
  }

// 2. Remainder logic for a partial pair tile: emits REM (< SVL) columns
// from tiles TA/TB as interleaved pairs, then advances P_BASE by one full
// pair-tile stride so subsequent pairs land at the packed-panel offset.
#define PROCESS_PARTIAL_PAIR( TA, TB, REM, P_BASE ) \
  { \
    int tcol = 0; \
    double* p_curr = P_BASE; \
    int n4 = REM >> 2; \
    \
    /* One 4-column op per group of four remaining columns. The */ \
    /* previous loop-less Duff's device executed at most 4 groups and */ \
    /* mishandled n4 in {5,6,7} (reachable at SME vector lengths where */ \
    /* SVL > 16 doubles); a plain loop is equivalent for n4 <= 4 and */ \
    /* correct for any vector length. */ \
    while ( n4-- > 0 ) \
    { \
      OP_VG4_PAIR( TA, TB, tcol, p_curr ); \
      tcol += 4; \
    } \
    /* Mop up the final 0-3 columns with 2- and 1-column ops. */ \
    switch ( REM & 3 ) \
    { \
      case 3: \
        OP_VG2_PAIR( TA, TB, tcol, p_curr ); \
        tcol += 2; \
        OP_VG1_PAIR( TA, TB, tcol, p_curr ); \
        break; \
      case 2: \
        OP_VG2_PAIR( TA, TB, tcol, p_curr ); \
        break; \
      case 1: \
        OP_VG1_PAIR( TA, TB, tcol, p_curr ); \
        break; \
      default: \
        break; \
    } \
    P_BASE += ( 2 * SVL * SVL ); \
  }

// 3. Logic for a full pair tile: emits all SVL columns of the TA/TB pair
// (SVL is assumed to be a multiple of 4) and advances P_BASE past it.
#define PROCESS_FULL_PAIR( TA, TB, P_BASE ) \
  { \
    double* p_curr = P_BASE; \
    for ( int tcol = 0; tcol < SVL; tcol += 4 ) \
    { \
      OP_VG4_PAIR( TA, TB, tcol, p_curr ); \
    } \
    P_BASE += ( 2 * SVL * SVL ); \
  }

// PATH 2

// 1.
Core Read, Shuffle & Store Logic for Groups +// Reads 4 columns from 4 Tiles (T0..T3) and stores interleaved +#define OP_VG4_GROUP( T0, T1, T2, T3, TCOL, P_PTR ) \ + { \ + svcount_t p_all = svptrue_c64(); \ + svfloat64x4_t z0 = svread_ver_za64_f64_vg4( T0, TCOL ); \ + svfloat64x4_t z1 = svread_ver_za64_f64_vg4( T1, TCOL ); \ + svfloat64x4_t z2 = svread_ver_za64_f64_vg4( T2, TCOL ); \ + svfloat64x4_t z3 = svread_ver_za64_f64_vg4( T3, TCOL ); \ + \ + /* Shuffle and Store Column 0 */ \ + svfloat64x4_t res0 = svcreate4( svget4( z0, 0 ), svget4( z1, 0 ), \ + svget4( z2, 0 ), svget4( z3, 0 ) ); \ + svst1( p_all, P_PTR, res0 ); \ + \ + /* Shuffle and Store Column 1 */ \ + svfloat64x4_t res1 = svcreate4( svget4( z0, 1 ), svget4( z1, 1 ), \ + svget4( z2, 1 ), svget4( z3, 1 ) ); \ + svst1( p_all, P_PTR + 4 * SVL, res1 ); \ + \ + /* Shuffle and Store Column 2 */ \ + svfloat64x4_t res2 = svcreate4( svget4( z0, 2 ), svget4( z1, 2 ), \ + svget4( z2, 2 ), svget4( z3, 2 ) ); \ + svst1( p_all, P_PTR + 8 * SVL, res2 ); \ + \ + /* Shuffle and Store Column 3 */ \ + svfloat64x4_t res3 = svcreate4( svget4( z0, 3 ), svget4( z1, 3 ), \ + svget4( z2, 3 ), svget4( z3, 3 ) ); \ + svst1( p_all, P_PTR + 12 * SVL, res3 ); \ + \ + P_PTR += ( 16 * SVL ); \ + } + +#define OP_VG2_GROUP( T0, T1, T2, T3, TCOL, P_PTR ) \ + { \ + svcount_t p_all = svptrue_c64(); \ + svfloat64x2_t z0 = svread_ver_za64_f64_vg2( T0, TCOL ); \ + svfloat64x2_t z1 = svread_ver_za64_f64_vg2( T1, TCOL ); \ + svfloat64x2_t z2 = svread_ver_za64_f64_vg2( T2, TCOL ); \ + svfloat64x2_t z3 = svread_ver_za64_f64_vg2( T3, TCOL ); \ + \ + /* Shuffle and Store Column 0 */ \ + svfloat64x4_t res0 = svcreate4( svget2( z0, 0 ), svget2( z1, 0 ), \ + svget2( z2, 0 ), svget2( z3, 0 ) ); \ + svst1( p_all, P_PTR, res0 ); \ + \ + /* Shuffle and Store Column 1 */ \ + svfloat64x4_t res1 = svcreate4( svget2( z0, 1 ), svget2( z1, 1 ), \ + svget2( z2, 1 ), svget2( z3, 1 ) ); \ + svst1( p_all, P_PTR + 4 * SVL, res1 ); \ + \ + P_PTR += ( 8 * SVL ); \ + 
} + +#define OP_VG1_GROUP( T0, T1, T2, T3, TCOL, P_PTR ) \ + { \ + svbool_t p_true = svptrue_b64(); \ + svcount_t p_cnt = svptrue_c64(); \ + \ + svfloat64_t z0 = svread_ver_za64_m( z0, p_true, T0, TCOL ); \ + svfloat64_t z1 = svread_ver_za64_m( z1, p_true, T1, TCOL ); \ + svfloat64_t z2 = svread_ver_za64_m( z2, p_true, T2, TCOL ); \ + svfloat64_t z3 = svread_ver_za64_m( z3, p_true, T3, TCOL ); \ + \ + /* Shuffle and Store */ \ + svfloat64x4_t res0 = svcreate4( z0, z1, z2, z3 ); \ + svst1( p_cnt, P_PTR, res0 ); \ + \ + P_PTR += ( 4 * SVL ); \ + } + +// 2. Logic for all group cases +#define PROCESS_GROUP( T0, T1, T2, T3, REM, P_BASE ) \ + { \ + int tcol = 0; \ + int local_rem = REM; \ + double* p_curr = P_BASE; \ + \ + while ( local_rem >= 4 ) \ + { \ + OP_VG4_GROUP( T0, T1, T2, T3, tcol, p_curr ); \ + tcol += 4; \ + local_rem -= 4; \ + } \ + if ( local_rem >= 2 ) \ + { \ + OP_VG2_GROUP( T0, T1, T2, T3, tcol, p_curr ); \ + tcol += 2; \ + local_rem -= 2; \ + } \ + if ( local_rem >= 1 ) \ + { \ + OP_VG1_GROUP( T0, T1, T2, T3, tcol, p_curr ); \ + } \ + } + +__arm_new( "za" ) __arm_locally_streaming void bli_dpackm_armsme_int_4SVLx2SVL + ( + conj_t conja, + pack_t schema, + dim_t cdim_, + dim_t cdim_max, + dim_t cdim_bcast, + dim_t n_, + dim_t n_max_, + const void *kappa, + const void *a, inc_t inca_, inc_t lda_, + void *p, inc_t ldp_, + const void *params, + const cntx_t * cntx + ) +{ + const int64_t cdim = cdim_; + const int64_t n = n_; + const int64_t inca = inca_; + const int64_t lda = lda_; + const int64_t ldp = ldp_; + + double* restrict a_ = (double*)a; + double* restrict p_ = (double*)p; + + uint64_t SVL = svcntsd(); + + svfloat64x4_t tmp; + svfloat64x2_t tmp2; + + const double* restrict alpha1 = a; + double* restrict pi1 = p; + + const bool gs = ( inca != 1 && lda != 1 ); + + if ( !gs && cdim_bcast ) + { + if ( bli_deq1( *( (double*)kappa ) ) ) + { + if ( inca == 1 && ldp == 4 * SVL ) + // continous memory.packA style + { + svbool_t p0 = svwhilelt_b64( 
(int64_t)0, cdim ); + svbool_t p1 = svwhilelt_b64( (int64_t)SVL, cdim ); + svbool_t p2 = svwhilelt_b64( (int64_t)( 2 * SVL ), cdim ); + svbool_t p3 = svwhilelt_b64( (int64_t)( 3 * SVL ), cdim ); + + for ( dim_t k = n; k != 0; --k ) + { + svfloat64_t z0 = svld1_f64( p0, alpha1 + 0 * SVL ); + svfloat64_t z1 = svld1_f64( p1, alpha1 + 1 * SVL ); + svfloat64_t z2 = svld1_f64( p2, alpha1 + 2 * SVL ); + svfloat64_t z3 = svld1_f64( p3, alpha1 + 3 * SVL ); + + tmp = svcreate4( z0, z1, z2, z3 ); + + svst1_f64_x4( svptrue_c64(), pi1, tmp ); + + alpha1 += lda; + pi1 += ldp; + } + } + if ( inca == 1 && ldp == 2 * SVL ) + // continous memory.packA style + { + svbool_t p0 = svwhilelt_b64( (int64_t)0, cdim ); + svbool_t p1 = svwhilelt_b64( (int64_t)SVL, cdim ); + for ( dim_t k = n; k != 0; --k ) + { + svfloat64_t z0 = svld1_f64( p0, alpha1 + 0 * SVL ); + svfloat64_t z1 = svld1_f64( p1, alpha1 + 1 * SVL ); + + tmp2 = svcreate2( z0, z1 ); + + svst1_f64_x2( svptrue_c64(), pi1, tmp2 ); + + alpha1 += lda; + pi1 += ldp; + } + } + else if ( inca != 1 && ldp == 2 * SVL ) + { + for ( uint64_t col = 0; col < n; col += 4 * SVL ) + { + int64_t valid_cols = n - col; + + // Determine total valid rows for this vertical block + // (max 2 * SVL) + int64_t valid_rows = ( cdim % ( 2 * SVL ) == 0 ) ? 
+ ( 2 * SVL ) : + ( cdim % ( 2 * SVL ) ); + + // Generate the 4 standard SVE column predicates for the + // left-to-right f64 chunks + svbool_t pc0 = svwhilelt_b64( (int64_t)( 0 * SVL ), + valid_cols ); + svbool_t pc1 = svwhilelt_b64( (int64_t)( 1 * SVL ), + valid_cols ); + svbool_t pc2 = svwhilelt_b64( (int64_t)( 2 * SVL ), + valid_cols ); + svbool_t pc3 = svwhilelt_b64( (int64_t)( 3 * SVL ), + valid_cols ); + + svcount_t p_all = svptrue_c64(); + + if ( valid_cols >= 4 * SVL && valid_rows >= 2 * SVL ) + { + // FAST PATH: Perfect 2*SVL x 4*SVL block + for ( uint64_t trow = 0; trow < SVL; trow += 4 ) + { + const uint64_t tile_UL_corner = (trow)*inca + col; + const uint64_t tile_UR_corner = tile_UL_corner + + inca * SVL; + + // Group 1 (Tiles 4 through 7) + svfloat64x4_t zp0 = svld1_f64_x4( p_all, + &a_[tile_UL_corner + 0 * inca] ); + svfloat64x4_t zp1 = svld1_f64_x4( p_all, + &a_[tile_UL_corner + 1 * inca] ); + svfloat64x4_t zp2 = svld1_f64_x4( p_all, + &a_[tile_UL_corner + 2 * inca] ); + svfloat64x4_t zp3 = svld1_f64_x4( p_all, + &a_[tile_UL_corner + 3 * inca] ); + + svfloat64x4_t zq0 = svcreate4( svget4( zp0, 0 ), + svget4( zp1, 0 ), svget4( zp2, 0 ), + svget4( zp3, 0 ) ); + svfloat64x4_t zq1 = svcreate4( svget4( zp0, 1 ), + svget4( zp1, 1 ), svget4( zp2, 1 ), + svget4( zp3, 1 ) ); + svfloat64x4_t zq2 = svcreate4( svget4( zp0, 2 ), + svget4( zp1, 2 ), svget4( zp2, 2 ), + svget4( zp3, 2 ) ); + svfloat64x4_t zq3 = svcreate4( svget4( zp0, 3 ), + svget4( zp1, 3 ), svget4( zp2, 3 ), + svget4( zp3, 3 ) ); + + svwrite_hor_za64_f64_vg4( 4, trow, zq0 ); + svwrite_hor_za64_f64_vg4( 5, trow, zq1 ); + svwrite_hor_za64_f64_vg4( 6, trow, zq2 ); + svwrite_hor_za64_f64_vg4( 7, trow, zq3 ); + + // Group 2 (Tiles 0 through 3) + svfloat64x4_t zp4 = svld1_f64_x4( p_all, + &a_[tile_UR_corner + 0 * inca] ); + svfloat64x4_t zp5 = svld1_f64_x4( p_all, + &a_[tile_UR_corner + 1 * inca] ); + svfloat64x4_t zp6 = svld1_f64_x4( p_all, + &a_[tile_UR_corner + 2 * inca] ); + svfloat64x4_t zp7 = 
svld1_f64_x4( p_all, + &a_[tile_UR_corner + 3 * inca] ); + + svfloat64x4_t zq4 = svcreate4( svget4( zp4, 0 ), + svget4( zp5, 0 ), svget4( zp6, 0 ), + svget4( zp7, 0 ) ); + svfloat64x4_t zq5 = svcreate4( svget4( zp4, 1 ), + svget4( zp5, 1 ), svget4( zp6, 1 ), + svget4( zp7, 1 ) ); + svfloat64x4_t zq6 = svcreate4( svget4( zp4, 2 ), + svget4( zp5, 2 ), svget4( zp6, 2 ), + svget4( zp7, 2 ) ); + svfloat64x4_t zq7 = svcreate4( svget4( zp4, 3 ), + svget4( zp5, 3 ), svget4( zp6, 3 ), + svget4( zp7, 3 ) ); + + svwrite_hor_za64_f64_vg4( 0, trow, zq4 ); + svwrite_hor_za64_f64_vg4( 1, trow, zq5 ); + svwrite_hor_za64_f64_vg4( 2, trow, zq6 ); + svwrite_hor_za64_f64_vg4( 3, trow, zq7 ); + } + } + else + { + // SAFE PATH: Matrix edge + for ( uint64_t trow = 0; trow < SVL; trow += 4 ) + { + const uint64_t tile_UL_corner = (trow)*inca + col; + const uint64_t tile_UR_corner = tile_UL_corner + + inca * SVL; + + // 1. Create undefined default vectors + svfloat64_t undef_v = svundef_f64(); + svfloat64x4_t undef_x4 = svcreate4( undef_v, + undef_v, undef_v, undef_v ); + + // 2. Default all load arrays to empty to guarantee + // safety + svfloat64x4_t zp0 = undef_x4, zp1 = undef_x4, + zp2 = undef_x4, zp3 = undef_x4; + svfloat64x4_t zp4 = undef_x4, zp5 = undef_x4, + zp6 = undef_x4, zp7 = undef_x4; + + // 3. Calculate rows left independently for the top + // and bottom block + int64_t rows_left_top = valid_rows - trow; + int64_t rows_left_bot = valid_rows - ( SVL + trow ); + + // 4. 
Load top rows (writes to tiles 4, 5, 6, 7) + if ( rows_left_top > 0 ) + { + zp0 = svcreate4( svld1_f64( pc0, + &a_[tile_UL_corner + + 0 * inca + 0 * SVL] ), + svld1_f64( pc1, + &a_[tile_UL_corner + 0 * inca + + 1 * SVL] ), + svld1_f64( pc2, + &a_[tile_UL_corner + 0 * inca + + 2 * SVL] ), + svld1_f64( pc3, + &a_[tile_UL_corner + 0 * inca + + 3 * SVL] ) ); + } + if ( rows_left_top > 1 ) + { + zp1 = svcreate4( svld1_f64( pc0, + &a_[tile_UL_corner + + 1 * inca + 0 * SVL] ), + svld1_f64( pc1, + &a_[tile_UL_corner + 1 * inca + + 1 * SVL] ), + svld1_f64( pc2, + &a_[tile_UL_corner + 1 * inca + + 2 * SVL] ), + svld1_f64( pc3, + &a_[tile_UL_corner + 1 * inca + + 3 * SVL] ) ); + } + if ( rows_left_top > 2 ) + { + zp2 = svcreate4( svld1_f64( pc0, + &a_[tile_UL_corner + + 2 * inca + 0 * SVL] ), + svld1_f64( pc1, + &a_[tile_UL_corner + 2 * inca + + 1 * SVL] ), + svld1_f64( pc2, + &a_[tile_UL_corner + 2 * inca + + 2 * SVL] ), + svld1_f64( pc3, + &a_[tile_UL_corner + 2 * inca + + 3 * SVL] ) ); + } + if ( rows_left_top > 3 ) + { + zp3 = svcreate4( svld1_f64( pc0, + &a_[tile_UL_corner + + 3 * inca + 0 * SVL] ), + svld1_f64( pc1, + &a_[tile_UL_corner + 3 * inca + + 1 * SVL] ), + svld1_f64( pc2, + &a_[tile_UL_corner + 3 * inca + + 2 * SVL] ), + svld1_f64( pc3, + &a_[tile_UL_corner + 3 * inca + + 3 * SVL] ) ); + } + + // 5. 
Load bottom rows (writes to tiles 0, 1, 2, 3) + if ( rows_left_bot > 0 ) + { + zp4 = svcreate4( svld1_f64( pc0, + &a_[tile_UR_corner + + 0 * inca + 0 * SVL] ), + svld1_f64( pc1, + &a_[tile_UR_corner + 0 * inca + + 1 * SVL] ), + svld1_f64( pc2, + &a_[tile_UR_corner + 0 * inca + + 2 * SVL] ), + svld1_f64( pc3, + &a_[tile_UR_corner + 0 * inca + + 3 * SVL] ) ); + } + if ( rows_left_bot > 1 ) + { + zp5 = svcreate4( svld1_f64( pc0, + &a_[tile_UR_corner + + 1 * inca + 0 * SVL] ), + svld1_f64( pc1, + &a_[tile_UR_corner + 1 * inca + + 1 * SVL] ), + svld1_f64( pc2, + &a_[tile_UR_corner + 1 * inca + + 2 * SVL] ), + svld1_f64( pc3, + &a_[tile_UR_corner + 1 * inca + + 3 * SVL] ) ); + } + if ( rows_left_bot > 2 ) + { + zp6 = svcreate4( svld1_f64( pc0, + &a_[tile_UR_corner + + 2 * inca + 0 * SVL] ), + svld1_f64( pc1, + &a_[tile_UR_corner + 2 * inca + + 1 * SVL] ), + svld1_f64( pc2, + &a_[tile_UR_corner + 2 * inca + + 2 * SVL] ), + svld1_f64( pc3, + &a_[tile_UR_corner + 2 * inca + + 3 * SVL] ) ); + } + if ( rows_left_bot > 3 ) + { + zp7 = svcreate4( svld1_f64( pc0, + &a_[tile_UR_corner + + 3 * inca + 0 * SVL] ), + svld1_f64( pc1, + &a_[tile_UR_corner + 3 * inca + + 1 * SVL] ), + svld1_f64( pc2, + &a_[tile_UR_corner + 3 * inca + + 2 * SVL] ), + svld1_f64( pc3, + &a_[tile_UR_corner + 3 * inca + + 3 * SVL] ) ); + } + + // 6. 
Shuffle into x4 tuples + svfloat64x4_t zq0 = svcreate4( svget4( zp0, 0 ), + svget4( zp1, 0 ), svget4( zp2, 0 ), + svget4( zp3, 0 ) ); + svfloat64x4_t zq1 = svcreate4( svget4( zp0, 1 ), + svget4( zp1, 1 ), svget4( zp2, 1 ), + svget4( zp3, 1 ) ); + svfloat64x4_t zq2 = svcreate4( svget4( zp0, 2 ), + svget4( zp1, 2 ), svget4( zp2, 2 ), + svget4( zp3, 2 ) ); + svfloat64x4_t zq3 = svcreate4( svget4( zp0, 3 ), + svget4( zp1, 3 ), svget4( zp2, 3 ), + svget4( zp3, 3 ) ); + + svfloat64x4_t zq4 = svcreate4( svget4( zp4, 0 ), + svget4( zp5, 0 ), svget4( zp6, 0 ), + svget4( zp7, 0 ) ); + svfloat64x4_t zq5 = svcreate4( svget4( zp4, 1 ), + svget4( zp5, 1 ), svget4( zp6, 1 ), + svget4( zp7, 1 ) ); + svfloat64x4_t zq6 = svcreate4( svget4( zp4, 2 ), + svget4( zp5, 2 ), svget4( zp6, 2 ), + svget4( zp7, 2 ) ); + svfloat64x4_t zq7 = svcreate4( svget4( zp4, 3 ), + svget4( zp5, 3 ), svget4( zp6, 3 ), + svget4( zp7, 3 ) ); + + // 7. Write into ZA + svwrite_hor_za64_f64_vg4( 4, trow, zq0 ); + svwrite_hor_za64_f64_vg4( 5, trow, zq1 ); + svwrite_hor_za64_f64_vg4( 6, trow, zq2 ); + svwrite_hor_za64_f64_vg4( 7, trow, zq3 ); + + svwrite_hor_za64_f64_vg4( 0, trow, zq4 ); + svwrite_hor_za64_f64_vg4( 1, trow, zq5 ); + svwrite_hor_za64_f64_vg4( 2, trow, zq6 ); + svwrite_hor_za64_f64_vg4( 3, trow, zq7 ); + } + } + // Check if we are at the edge and fewer than + // 4 * SVL columns remain + if ( col + ( 4 * SVL ) > n ) + { + int total_rem = n - col; + + // --- PAIR 1: Tiles 0 and 4 --- + if ( total_rem >= (int)SVL ) + { + PROCESS_FULL_PAIR( 0, 4, p_ ); + total_rem -= SVL; + } + else + { + PROCESS_PARTIAL_PAIR( 0, 4, total_rem, p_ ); + total_rem = 0; + } + + // --- PAIR 2: Tiles 1 and 5 --- + if ( total_rem > 0 ) + { + if ( total_rem >= (int)SVL ) + { + PROCESS_FULL_PAIR( 1, 5, p_ ); + total_rem -= SVL; + } + else + { + PROCESS_PARTIAL_PAIR( 1, 5, total_rem, p_ ); + total_rem = 0; + } + } + + // --- PAIR 3: Tiles 2 and 6 --- + if ( total_rem > 0 ) + { + if ( total_rem >= (int)SVL ) + { + 
PROCESS_FULL_PAIR( 2, 6, p_ ); + total_rem -= SVL; + } + else + { + PROCESS_PARTIAL_PAIR( 2, 6, total_rem, p_ ); + total_rem = 0; + } + } + + // --- PAIR 4: Tiles 3 and 7 --- + if ( total_rem > 0 ) + { + PROCESS_PARTIAL_PAIR( 3, 7, total_rem, p_ ); + } + } + else + { + // Read - as - columns and store + for ( uint64_t tcol = 0; tcol < SVL; tcol += 4 ) + { + svcount_t p0 = svptrue_c32(); + + // Each svread_ver reads 4 columns of the tile(SVL). + svfloat64x4_t zq0 = svread_ver_za64_f64_vg4( + /* tile: */ 0, /* slice: */ tcol ); + svfloat64x4_t zq2 = svread_ver_za64_f64_vg4( + /* tile: */ 1, /* slice: */ tcol ); + + svfloat64x4_t zq1 = svread_ver_za64_f64_vg4( + /* tile: */ 4, /* slice: */ tcol ); + svfloat64x4_t zq3 = svread_ver_za64_f64_vg4( + /* tile: */ 5, /* slice: */ tcol ); + + svfloat64x4_t zq0_ = svcreate4( svget4( zq1, 0 ), + svget4( zq0, 0 ), svget4( zq1, 1 ), + svget4( zq0, 1 ) ); + + svfloat64x4_t zq1_ = svcreate4( svget4( zq1, 2 ), + svget4( zq0, 2 ), svget4( zq1, 3 ), + svget4( zq0, 3 ) ); + + svfloat64x4_t zq2_ = svcreate4( svget4( zq3, 0 ), + svget4( zq2, 0 ), svget4( zq3, 1 ), + svget4( zq2, 1 ) ); + + svfloat64x4_t zq3_ = svcreate4( svget4( zq3, 2 ), + svget4( zq2, 2 ), svget4( zq3, 3 ), + svget4( zq2, 3 ) ); + + svst1( p0, &p_[0], zq0_ ); + svst1( p0, &p_[4 * SVL], zq1_ ); + svst1( p0, &p_[2 * SVL * SVL], zq2_ ); + svst1( p0, &p_[2 * SVL * SVL + 4 * SVL], zq3_ ); + + // Each svread_ver reads 4 columns of the tile(SVL). 
+ svfloat64x4_t zq4 = svread_ver_za64_f64_vg4( + /* tile: */ 2, /* slice: */ tcol ); + svfloat64x4_t zq5 = svread_ver_za64_f64_vg4( + /* tile: */ 6, /* slice: */ tcol ); + + svfloat64x4_t zq6 = svread_ver_za64_f64_vg4( + /* tile: */ 3, /* slice: */ tcol ); + svfloat64x4_t zq7 = svread_ver_za64_f64_vg4( + /* tile: */ 7, /* slice: */ tcol ); + + svfloat64x4_t zq4_ = svcreate4( svget4( zq5, 0 ), + svget4( zq4, 0 ), svget4( zq5, 1 ), + svget4( zq4, 1 ) ); + + svfloat64x4_t zq5_ = svcreate4( svget4( zq5, 2 ), + svget4( zq4, 2 ), svget4( zq5, 3 ), + svget4( zq4, 3 ) ); + + svfloat64x4_t zq6_ = svcreate4( svget4( zq7, 0 ), + svget4( zq6, 0 ), svget4( zq7, 1 ), + svget4( zq6, 1 ) ); + + svfloat64x4_t zq7_ = svcreate4( svget4( zq7, 2 ), + svget4( zq6, 2 ), svget4( zq7, 3 ), + svget4( zq6, 3 ) ); + + svst1( p0, &p_[4 * SVL * SVL], zq4_ ); + svst1( p0, &p_[4 * SVL * SVL + 4 * SVL], zq5_ ); + svst1( p0, &p_[6 * SVL * SVL], zq6_ ); + svst1( p0, &p_[6 * SVL * SVL + 4 * SVL], zq7_ ); + + p_ += ( 8 * SVL ); + } + p_ += ( 6 * SVL * SVL ); + } + } + + p_ = (double*)p; + } + else if ( inca != 1 && ldp == 4 * SVL ) + { + for ( uint64_t col = 0; col < n; col += 2 * SVL ) + { + int64_t valid_cols = n - col; + + // Determine total valid rows for this vertical block + // (max 4 * SVL) + int64_t valid_rows = ( cdim % ( 4 * SVL ) == 0 ) ? 
+ ( 4 * SVL ) : + ( cdim % ( 4 * SVL ) ); + + // Generate the 2 standard SVE column predicates for the + // left and right f64 chunks + svbool_t pc0 = svwhilelt_b64( (int64_t)( 0 * SVL ), + valid_cols ); + svbool_t pc1 = svwhilelt_b64( (int64_t)( 1 * SVL ), + valid_cols ); + + svcount_t p_all = svptrue_c64(); + + if ( valid_cols >= 2 * SVL && valid_rows >= 4 * SVL ) + { + // FAST PATH: Perfect 4*SVL x 2*SVL block + for ( uint64_t trow = 0; trow < SVL; trow += 4 ) + { + const uint64_t tile_UL_corner = (trow)*inca + col; + const uint64_t tile_UR_corner = tile_UL_corner + + inca * SVL; + const uint64_t tile_BL_corner = tile_UL_corner + + inca * 2 * SVL; + const uint64_t tile_BR_corner = tile_UL_corner + + inca * 3 * SVL; + + // Group 1 (Tiles 0 & 4) + svfloat64x2_t zp0 = svld1_f64_x2( p_all, + &a_[tile_UL_corner + 0 * inca] ); + svfloat64x2_t zp1 = svld1_f64_x2( p_all, + &a_[tile_UL_corner + 1 * inca] ); + svfloat64x2_t zp2 = svld1_f64_x2( p_all, + &a_[tile_UL_corner + 2 * inca] ); + svfloat64x2_t zp3 = svld1_f64_x2( p_all, + &a_[tile_UL_corner + 3 * inca] ); + + // Group 2 (Tiles 1 & 5) + svfloat64x2_t zp4 = svld1_f64_x2( p_all, + &a_[tile_UR_corner + 0 * inca] ); + svfloat64x2_t zp5 = svld1_f64_x2( p_all, + &a_[tile_UR_corner + 1 * inca] ); + svfloat64x2_t zp6 = svld1_f64_x2( p_all, + &a_[tile_UR_corner + 2 * inca] ); + svfloat64x2_t zp7 = svld1_f64_x2( p_all, + &a_[tile_UR_corner + 3 * inca] ); + + svfloat64x4_t zq0 = svcreate4( svget2( zp0, 0 ), + svget2( zp1, 0 ), svget2( zp2, 0 ), + svget2( zp3, 0 ) ); + svfloat64x4_t zq1 = svcreate4( svget2( zp0, 1 ), + svget2( zp1, 1 ), svget2( zp2, 1 ), + svget2( zp3, 1 ) ); + svfloat64x4_t zq2 = svcreate4( svget2( zp4, 0 ), + svget2( zp5, 0 ), svget2( zp6, 0 ), + svget2( zp7, 0 ) ); + svfloat64x4_t zq3 = svcreate4( svget2( zp4, 1 ), + svget2( zp5, 1 ), svget2( zp6, 1 ), + svget2( zp7, 1 ) ); + + svwrite_hor_za64_f64_vg4( 0, trow, zq0 ); + svwrite_hor_za64_f64_vg4( 4, trow, zq1 ); + svwrite_hor_za64_f64_vg4( 1, trow, zq2 ); + 
svwrite_hor_za64_f64_vg4( 5, trow, zq3 ); + + // Group 3 (Tiles 2 & 6) + svfloat64x2_t zp8 = svld1_f64_x2( p_all, + &a_[tile_BL_corner + 0 * inca] ); + svfloat64x2_t zp9 = svld1_f64_x2( p_all, + &a_[tile_BL_corner + 1 * inca] ); + svfloat64x2_t zp10 = svld1_f64_x2( p_all, + &a_[tile_BL_corner + 2 * inca] ); + svfloat64x2_t zp11 = svld1_f64_x2( p_all, + &a_[tile_BL_corner + 3 * inca] ); + + // Group 4 (Tiles 3 & 7) + svfloat64x2_t zp12 = svld1_f64_x2( p_all, + &a_[tile_BR_corner + 0 * inca] ); + svfloat64x2_t zp13 = svld1_f64_x2( p_all, + &a_[tile_BR_corner + 1 * inca] ); + svfloat64x2_t zp14 = svld1_f64_x2( p_all, + &a_[tile_BR_corner + 2 * inca] ); + svfloat64x2_t zp15 = svld1_f64_x2( p_all, + &a_[tile_BR_corner + 3 * inca] ); + + svfloat64x4_t zq4 = svcreate4( svget2( zp8, 0 ), + svget2( zp9, 0 ), svget2( zp10, 0 ), + svget2( zp11, 0 ) ); + svfloat64x4_t zq5 = svcreate4( svget2( zp8, 1 ), + svget2( zp9, 1 ), svget2( zp10, 1 ), + svget2( zp11, 1 ) ); + svfloat64x4_t zq6 = svcreate4( svget2( zp12, 0 ), + svget2( zp13, 0 ), svget2( zp14, 0 ), + svget2( zp15, 0 ) ); + svfloat64x4_t zq7 = svcreate4( svget2( zp12, 1 ), + svget2( zp13, 1 ), svget2( zp14, 1 ), + svget2( zp15, 1 ) ); + + svwrite_hor_za64_f64_vg4( 2, trow, zq4 ); + svwrite_hor_za64_f64_vg4( 6, trow, zq5 ); + svwrite_hor_za64_f64_vg4( 3, trow, zq6 ); + svwrite_hor_za64_f64_vg4( 7, trow, zq7 ); + } + } + else + { + // SAFE PATH: Matrix edge + for ( uint64_t trow = 0; trow < SVL; trow += 4 ) + { + const uint64_t tile_UL_corner = (trow)*inca + col; + const uint64_t tile_UR_corner = tile_UL_corner + + inca * SVL; + const uint64_t tile_BL_corner = tile_UL_corner + + inca * 2 * SVL; + const uint64_t tile_BR_corner = tile_UL_corner + + inca * 3 * SVL; + + // 1. Create undefined default vectors + svfloat64_t undef_v = svundef_f64(); + svfloat64x2_t undef_x2 = svcreate2( undef_v, + undef_v ); + + // 2. 
Default all load arrays to empty to guarantee + // safety + svfloat64x2_t zp0 = undef_x2, zp1 = undef_x2, + zp2 = undef_x2, zp3 = undef_x2; + svfloat64x2_t zp4 = undef_x2, zp5 = undef_x2, + zp6 = undef_x2, zp7 = undef_x2; + svfloat64x2_t zp8 = undef_x2, zp9 = undef_x2, + zp10 = undef_x2, zp11 = undef_x2; + svfloat64x2_t zp12 = undef_x2, zp13 = undef_x2, + zp14 = undef_x2, zp15 = undef_x2; + + // 3. Calculate rows left independently for all 4 + // vertical groups + int64_t rows_left_0 = valid_rows - trow; + int64_t rows_left_1 = valid_rows - ( SVL + trow ); + int64_t rows_left_2 = valid_rows - + ( 2 * SVL + trow ); + int64_t rows_left_3 = valid_rows - + ( 3 * SVL + trow ); + + // 4. Load Group 1 (writes to tiles 0 and 4) + if ( rows_left_0 > 0 ) + zp0 = svcreate2( + svld1_f64( pc0, + &a_[tile_UL_corner + 0 * inca] ), + svld1_f64( pc1, + &a_[tile_UL_corner + 0 * inca + + SVL] ) ); + if ( rows_left_0 > 1 ) + zp1 = svcreate2( + svld1_f64( pc0, + &a_[tile_UL_corner + 1 * inca] ), + svld1_f64( pc1, + &a_[tile_UL_corner + 1 * inca + + SVL] ) ); + if ( rows_left_0 > 2 ) + zp2 = svcreate2( + svld1_f64( pc0, + &a_[tile_UL_corner + 2 * inca] ), + svld1_f64( pc1, + &a_[tile_UL_corner + 2 * inca + + SVL] ) ); + if ( rows_left_0 > 3 ) + zp3 = svcreate2( + svld1_f64( pc0, + &a_[tile_UL_corner + 3 * inca] ), + svld1_f64( pc1, + &a_[tile_UL_corner + 3 * inca + + SVL] ) ); + + // 5. 
Load Group 2 (writes to tiles 1 and 5) + if ( rows_left_1 > 0 ) + zp4 = svcreate2( + svld1_f64( pc0, + &a_[tile_UR_corner + 0 * inca] ), + svld1_f64( pc1, + &a_[tile_UR_corner + 0 * inca + + SVL] ) ); + if ( rows_left_1 > 1 ) + zp5 = svcreate2( + svld1_f64( pc0, + &a_[tile_UR_corner + 1 * inca] ), + svld1_f64( pc1, + &a_[tile_UR_corner + 1 * inca + + SVL] ) ); + if ( rows_left_1 > 2 ) + zp6 = svcreate2( + svld1_f64( pc0, + &a_[tile_UR_corner + 2 * inca] ), + svld1_f64( pc1, + &a_[tile_UR_corner + 2 * inca + + SVL] ) ); + if ( rows_left_1 > 3 ) + zp7 = svcreate2( + svld1_f64( pc0, + &a_[tile_UR_corner + 3 * inca] ), + svld1_f64( pc1, + &a_[tile_UR_corner + 3 * inca + + SVL] ) ); + + // 6. Load Group 3 (writes to tiles 2 and 6) + if ( rows_left_2 > 0 ) + zp8 = svcreate2( + svld1_f64( pc0, + &a_[tile_BL_corner + 0 * inca] ), + svld1_f64( pc1, + &a_[tile_BL_corner + 0 * inca + + SVL] ) ); + if ( rows_left_2 > 1 ) + zp9 = svcreate2( + svld1_f64( pc0, + &a_[tile_BL_corner + 1 * inca] ), + svld1_f64( pc1, + &a_[tile_BL_corner + 1 * inca + + SVL] ) ); + if ( rows_left_2 > 2 ) + zp10 = svcreate2( + svld1_f64( pc0, + &a_[tile_BL_corner + 2 * inca] ), + svld1_f64( pc1, + &a_[tile_BL_corner + 2 * inca + + SVL] ) ); + if ( rows_left_2 > 3 ) + zp11 = svcreate2( + svld1_f64( pc0, + &a_[tile_BL_corner + 3 * inca] ), + svld1_f64( pc1, + &a_[tile_BL_corner + 3 * inca + + SVL] ) ); + + // 7. 
Load Group 4 (writes to tiles 3 and 7) + if ( rows_left_3 > 0 ) + zp12 = svcreate2( + svld1_f64( pc0, + &a_[tile_BR_corner + 0 * inca] ), + svld1_f64( pc1, + &a_[tile_BR_corner + 0 * inca + + SVL] ) ); + if ( rows_left_3 > 1 ) + zp13 = svcreate2( + svld1_f64( pc0, + &a_[tile_BR_corner + 1 * inca] ), + svld1_f64( pc1, + &a_[tile_BR_corner + 1 * inca + + SVL] ) ); + if ( rows_left_3 > 2 ) + zp14 = svcreate2( + svld1_f64( pc0, + &a_[tile_BR_corner + 2 * inca] ), + svld1_f64( pc1, + &a_[tile_BR_corner + 2 * inca + + SVL] ) ); + if ( rows_left_3 > 3 ) + zp15 = svcreate2( + svld1_f64( pc0, + &a_[tile_BR_corner + 3 * inca] ), + svld1_f64( pc1, + &a_[tile_BR_corner + 3 * inca + + SVL] ) ); + + // 8. Shuffle into x4 tuples + svfloat64x4_t zq0 = svcreate4( svget2( zp0, 0 ), + svget2( zp1, 0 ), svget2( zp2, 0 ), + svget2( zp3, 0 ) ); + svfloat64x4_t zq1 = svcreate4( svget2( zp0, 1 ), + svget2( zp1, 1 ), svget2( zp2, 1 ), + svget2( zp3, 1 ) ); + svfloat64x4_t zq2 = svcreate4( svget2( zp4, 0 ), + svget2( zp5, 0 ), svget2( zp6, 0 ), + svget2( zp7, 0 ) ); + svfloat64x4_t zq3 = svcreate4( svget2( zp4, 1 ), + svget2( zp5, 1 ), svget2( zp6, 1 ), + svget2( zp7, 1 ) ); + + // 9. 
Write into ZA + svwrite_hor_za64_f64_vg4( 0, trow, zq0 ); + svwrite_hor_za64_f64_vg4( 4, trow, zq1 ); + svwrite_hor_za64_f64_vg4( 1, trow, zq2 ); + svwrite_hor_za64_f64_vg4( 5, trow, zq3 ); + + svfloat64x4_t zq4 = svcreate4( svget2( zp8, 0 ), + svget2( zp9, 0 ), svget2( zp10, 0 ), + svget2( zp11, 0 ) ); + svfloat64x4_t zq5 = svcreate4( svget2( zp8, 1 ), + svget2( zp9, 1 ), svget2( zp10, 1 ), + svget2( zp11, 1 ) ); + svfloat64x4_t zq6 = svcreate4( svget2( zp12, 0 ), + svget2( zp13, 0 ), svget2( zp14, 0 ), + svget2( zp15, 0 ) ); + svfloat64x4_t zq7 = svcreate4( svget2( zp12, 1 ), + svget2( zp13, 1 ), svget2( zp14, 1 ), + svget2( zp15, 1 ) ); + + svwrite_hor_za64_f64_vg4( 2, trow, zq4 ); + svwrite_hor_za64_f64_vg4( 6, trow, zq5 ); + svwrite_hor_za64_f64_vg4( 3, trow, zq6 ); + svwrite_hor_za64_f64_vg4( 7, trow, zq7 ); + } + } + // Check if we are at the edge where fewer than + // 2 * SVL columns remain + if ( col + ( 2 * SVL ) > n ) + { + int total_rem = n - col; + + // --- GROUP 1: Tiles 0, 1, 2, 3 --- + int rem_g1 = ( total_rem > (int)SVL ) ? (int)SVL : + total_rem; + + PROCESS_GROUP( 0, 1, 2, 3, rem_g1, &p_[0] ); + + // --- GROUP 2: Tiles 4, 5, 6, 7 --- + if ( total_rem > (int)SVL ) + { + int rem_g2 = total_rem - (int)SVL; + PROCESS_GROUP( 4, 5, 6, 7, rem_g2, + &p_[4 * SVL * SVL] ); + } + } + else + { + // Read - as - columns and store + for ( uint64_t tcol = 0; tcol < SVL; tcol += 4 ) + { + svcount_t p0 = svptrue_c32(); + + // Each svread_ver reads 4 columns of the tile(SVL). 
+ svfloat64x4_t zq0 = svread_ver_za64_f64_vg4( + /* tile: */ 0, /* slice: */ tcol ); + svfloat64x4_t zq2 = svread_ver_za64_f64_vg4( + /* tile: */ 2, /* slice: */ tcol ); + + svfloat64x4_t zq1 = svread_ver_za64_f64_vg4( + /* tile: */ 1, /* slice: */ tcol ); + svfloat64x4_t zq3 = svread_ver_za64_f64_vg4( + /* tile: */ 3, /* slice: */ tcol ); + + svfloat64x4_t zq0_ = svcreate4( svget4( zq0, 0 ), + svget4( zq1, 0 ), svget4( zq2, 0 ), + svget4( zq3, 0 ) ); + + svfloat64x4_t zq1_ = svcreate4( svget4( zq0, 1 ), + svget4( zq1, 1 ), svget4( zq2, 1 ), + svget4( zq3, 1 ) ); + + svfloat64x4_t zq2_ = svcreate4( svget4( zq0, 2 ), + svget4( zq1, 2 ), svget4( zq2, 2 ), + svget4( zq3, 2 ) ); + + svfloat64x4_t zq3_ = svcreate4( svget4( zq0, 3 ), + svget4( zq1, 3 ), svget4( zq2, 3 ), + svget4( zq3, 3 ) ); + + svst1( p0, &p_[0], zq0_ ); + svst1( p0, &p_[4 * SVL], zq1_ ); + svst1( p0, &p_[8 * SVL], zq2_ ); + svst1( p0, &p_[12 * SVL], zq3_ ); + + // Each svread_ver reads 4 columns of the tile(SVL). + svfloat64x4_t zq4 = svread_ver_za64_f64_vg4( + /* tile: */ 4, /* slice: */ tcol ); + svfloat64x4_t zq5 = svread_ver_za64_f64_vg4( + /* tile: */ 5, /* slice: */ tcol ); + + svfloat64x4_t zq6 = svread_ver_za64_f64_vg4( + /* tile: */ 6, /* slice: */ tcol ); + svfloat64x4_t zq7 = svread_ver_za64_f64_vg4( + /* tile: */ 7, /* slice: */ tcol ); + + svfloat64x4_t zq4_ = svcreate4( svget4( zq4, 0 ), + svget4( zq5, 0 ), svget4( zq6, 0 ), + svget4( zq7, 0 ) ); + + svfloat64x4_t zq5_ = svcreate4( svget4( zq4, 1 ), + svget4( zq5, 1 ), svget4( zq6, 1 ), + svget4( zq7, 1 ) ); + + svfloat64x4_t zq6_ = svcreate4( svget4( zq4, 2 ), + svget4( zq5, 2 ), svget4( zq6, 2 ), + svget4( zq7, 2 ) ); + + svfloat64x4_t zq7_ = svcreate4( svget4( zq4, 3 ), + svget4( zq5, 3 ), svget4( zq6, 3 ), + svget4( zq7, 3 ) ); + + svst1( p0, &p_[4 * SVL * SVL], zq4_ ); + svst1( p0, &p_[4 * SVL * SVL + 4 * SVL], zq5_ ); + svst1( p0, &p_[4 * SVL * SVL + 8 * SVL], zq6_ ); + svst1( p0, &p_[4 * SVL * SVL + 12 * SVL], zq7_ ); + + p_ += ( 
2 * SVL * SVL ); + } + p_ += ( 4 * SVL * SVL ); + } + } + + p_ = (double*)p; + } + } + else + { + bli_dscal2bbs_mxn + ( + conja, + cdim_, + n_, + kappa, + a, inca, lda, + p_, cdim_bcast, ldp + ); + } + } + else + { + bli_dscal2bbs_mxn + ( + conja, + cdim_, + n_, + kappa, + a, inca, lda, + p_, cdim_bcast, ldp + ); + } + + bli_dset0s_edge + ( + cdim_ * cdim_bcast, cdim_max * cdim_bcast, + n_, n_max_, + p_, ldp + ); +} diff --git a/kernels/armsme/1m/bli_packm_armsme_int_dSVLx8SVL.c b/kernels/armsme/1m/bli_packm_armsme_int_dSVLx8SVL.c new file mode 100644 index 000000000..9f940c4cf --- /dev/null +++ b/kernels/armsme/1m/bli_packm_armsme_int_dSVLx8SVL.c @@ -0,0 +1,1158 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, Linaro Limited + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include +#include + +#include "blis.h" + +// MACROS FOR FALLTHROUGH LOGIC + +// PATH 1 + +// 1. Core Read & Store Logic +#define OP_VG4( TILE, TCOL, P_PTR ) \ + { \ + svcount_t p_all = svptrue_c64(); \ + svfloat64x4_t z = svread_ver_za64_f64_vg4( TILE, TCOL ); \ + svst1( p_all, P_PTR, z ); \ + P_PTR += ( 4 * SVL ); \ + } + +#define OP_VG2( TILE, TCOL, P_PTR ) \ + { \ + svcount_t p_all = svptrue_c64(); \ + svfloat64x2_t z = svread_ver_za64_f64_vg2( TILE, TCOL ); \ + svst1( p_all, P_PTR, z ); \ + P_PTR += ( 2 * SVL ); \ + } + +#define OP_VG1( TILE, TCOL, P_PTR ) \ + { \ + svbool_t p_true = svptrue_b64(); \ + svfloat64_t z = svread_ver_za64_m( z, p_true, TILE, TCOL ); \ + svst1( p_true, P_PTR, z ); \ + P_PTR += ( 1 * SVL ); \ + } + +// 2. 
Duff's Device Logic for a Partial Tile +#define PROCESS_PARTIAL_TILE( TILE_ID, REM, P_BASE ) \ + { \ + int tcol = 0; \ + double* p_curr = P_BASE; \ + int n4 = REM >> 2; /* Number of full VG4 blocks */ \ + \ + if ( n4 > 0 ) \ + { \ + switch ( n4 & 3 ) \ + { \ + case 0: \ + OP_VG4( TILE_ID, tcol, p_curr ); \ + tcol += 4; \ + case 3: \ + OP_VG4( TILE_ID, tcol, p_curr ); \ + tcol += 4; \ + case 2: \ + OP_VG4( TILE_ID, tcol, p_curr ); \ + tcol += 4; \ + case 1: \ + OP_VG4( TILE_ID, tcol, p_curr ); \ + tcol += 4; \ + } \ + } \ + \ + /* Handle remaining 1, 2, or 3 columns */ \ + switch ( REM & 3 ) \ + { \ + case 3: \ + OP_VG2( TILE_ID, tcol, p_curr ); \ + tcol += 2; \ + OP_VG1( TILE_ID, tcol, p_curr ); \ + break; \ + case 2: \ + OP_VG2( TILE_ID, tcol, p_curr ); \ + break; \ + case 1: \ + OP_VG1( TILE_ID, tcol, p_curr ); \ + break; \ + default: \ + break; \ + } \ + } + +// 3. Logic for a Full Tile +#define PROCESS_FULL_TILE( TILE_ID, P_BASE ) \ + { \ + double* p_curr = P_BASE; \ + for ( int tcol = 0; tcol < SVL; tcol += 4 ) \ + { \ + OP_VG4( TILE_ID, tcol, p_curr ); \ + } \ + } + +// PATH 2 + +// 1. 
Core Read, Shuffle & Store Logic +#define OP_SHUFFLED_VG4( TCOL, P_PTR ) \ + { \ + svcount_t p_all = svptrue_c64(); \ + svfloat64x4_t zq0 = svread_ver_za64_f64_vg4( 0, TCOL ); \ + svfloat64x4_t zq1 = svread_ver_za64_f64_vg4( 1, TCOL ); \ + svfloat64x4_t zq2 = svread_ver_za64_f64_vg4( 2, TCOL ); \ + svfloat64x4_t zq3 = svread_ver_za64_f64_vg4( 3, TCOL ); \ + svfloat64x4_t zq4 = svread_ver_za64_f64_vg4( 4, TCOL ); \ + svfloat64x4_t zq5 = svread_ver_za64_f64_vg4( 5, TCOL ); \ + svfloat64x4_t zq6 = svread_ver_za64_f64_vg4( 6, TCOL ); \ + svfloat64x4_t zq7 = svread_ver_za64_f64_vg4( 7, TCOL ); \ + \ + /* Shuffle and Store Column 0 (Offset 0) */ \ + svfloat64x4_t z_c0_top = svcreate4( svget4( zq0, 0 ), \ + svget4( zq1, 0 ), svget4( zq2, 0 ), svget4( zq3, 0 ) ); \ + svfloat64x4_t z_c0_bot = svcreate4( svget4( zq4, 0 ), \ + svget4( zq5, 0 ), svget4( zq6, 0 ), svget4( zq7, 0 ) ); \ + svst1( p_all, P_PTR, z_c0_top ); \ + svst1( p_all, P_PTR + 4 * SVL, z_c0_bot ); \ + \ + /* Shuffle and Store Column 1 (Offset 1*SVL*SVL) */ \ + svfloat64x4_t z_c1_top = svcreate4( svget4( zq0, 1 ), \ + svget4( zq1, 1 ), svget4( zq2, 1 ), svget4( zq3, 1 ) ); \ + svfloat64x4_t z_c1_bot = svcreate4( svget4( zq4, 1 ), \ + svget4( zq5, 1 ), svget4( zq6, 1 ), svget4( zq7, 1 ) ); \ + svst1( p_all, P_PTR + 1 * SVL * SVL, z_c1_top ); \ + svst1( p_all, P_PTR + 1 * SVL * SVL + 4 * SVL, z_c1_bot ); \ + \ + /* Shuffle and Store Column 2 (Offset 2*SVL*SVL) */ \ + svfloat64x4_t z_c2_top = svcreate4( svget4( zq0, 2 ), \ + svget4( zq1, 2 ), svget4( zq2, 2 ), svget4( zq3, 2 ) ); \ + svfloat64x4_t z_c2_bot = svcreate4( svget4( zq4, 2 ), \ + svget4( zq5, 2 ), svget4( zq6, 2 ), svget4( zq7, 2 ) ); \ + svst1( p_all, P_PTR + 2 * SVL * SVL, z_c2_top ); \ + svst1( p_all, P_PTR + 2 * SVL * SVL + 4 * SVL, z_c2_bot ); \ + \ + /* Shuffle and Store Column 3 (Offset 3*SVL*SVL) */ \ + svfloat64x4_t z_c3_top = svcreate4( svget4( zq0, 3 ), \ + svget4( zq1, 3 ), svget4( zq2, 3 ), svget4( zq3, 3 ) ); \ + svfloat64x4_t z_c3_bot = 
svcreate4( svget4( zq4, 3 ), \ + svget4( zq5, 3 ), svget4( zq6, 3 ), svget4( zq7, 3 ) ); \ + svst1( p_all, P_PTR + 3 * SVL * SVL, z_c3_top ); \ + svst1( p_all, P_PTR + 3 * SVL * SVL + 4 * SVL, z_c3_bot ); \ + \ + P_PTR += ( 4 * SVL * SVL ); \ + } + +#define OP_SHUFFLED_VG2( TCOL, P_PTR ) \ + { \ + svcount_t p_all = svptrue_c64(); \ + svfloat64x2_t zq0 = svread_ver_za64_f64_vg2( 0, TCOL ); \ + svfloat64x2_t zq1 = svread_ver_za64_f64_vg2( 1, TCOL ); \ + svfloat64x2_t zq2 = svread_ver_za64_f64_vg2( 2, TCOL ); \ + svfloat64x2_t zq3 = svread_ver_za64_f64_vg2( 3, TCOL ); \ + svfloat64x2_t zq4 = svread_ver_za64_f64_vg2( 4, TCOL ); \ + svfloat64x2_t zq5 = svread_ver_za64_f64_vg2( 5, TCOL ); \ + svfloat64x2_t zq6 = svread_ver_za64_f64_vg2( 6, TCOL ); \ + svfloat64x2_t zq7 = svread_ver_za64_f64_vg2( 7, TCOL ); \ + \ + svfloat64x4_t z_c0_top = svcreate4( svget2( zq0, 0 ), \ + svget2( zq1, 0 ), svget2( zq2, 0 ), svget2( zq3, 0 ) ); \ + svfloat64x4_t z_c0_bot = svcreate4( svget2( zq4, 0 ), \ + svget2( zq5, 0 ), svget2( zq6, 0 ), svget2( zq7, 0 ) ); \ + svst1( p_all, P_PTR, z_c0_top ); \ + svst1( p_all, P_PTR + 4 * SVL, z_c0_bot ); \ + \ + svfloat64x4_t z_c1_top = svcreate4( svget2( zq0, 1 ), \ + svget2( zq1, 1 ), svget2( zq2, 1 ), svget2( zq3, 1 ) ); \ + svfloat64x4_t z_c1_bot = svcreate4( svget2( zq4, 1 ), \ + svget2( zq5, 1 ), svget2( zq6, 1 ), svget2( zq7, 1 ) ); \ + svst1( p_all, P_PTR + 1 * SVL * SVL, z_c1_top ); \ + svst1( p_all, P_PTR + 1 * SVL * SVL + 4 * SVL, z_c1_bot ); \ + \ + P_PTR += ( 2 * SVL * SVL ); \ + } + +#define OP_SHUFFLED_VG1( TCOL, P_PTR ) \ + { \ + svbool_t p_true = svptrue_b64(); \ + svcount_t p_cnt = svptrue_c64(); \ + svfloat64_t ud = svundef_f64(); \ + svfloat64_t z0 = svread_ver_za64_m( ud, p_true, 0, TCOL ); \ + svfloat64_t z1 = svread_ver_za64_m( ud, p_true, 1, TCOL ); \ + svfloat64_t z2 = svread_ver_za64_m( ud, p_true, 2, TCOL ); \ + svfloat64_t z3 = svread_ver_za64_m( ud, p_true, 3, TCOL ); \ + svfloat64_t z4 = svread_ver_za64_m( ud, p_true, 4, 
TCOL ); \ + svfloat64_t z5 = svread_ver_za64_m( ud, p_true, 5, TCOL ); \ + svfloat64_t z6 = svread_ver_za64_m( ud, p_true, 6, TCOL ); \ + svfloat64_t z7 = svread_ver_za64_m( ud, p_true, 7, TCOL ); \ + \ + svfloat64x4_t z_top = svcreate4( z0, z1, z2, z3 ); \ + svfloat64x4_t z_bot = svcreate4( z4, z5, z6, z7 ); \ + svst1( p_cnt, P_PTR, z_top ); \ + svst1( p_cnt, P_PTR + 4 * SVL, z_bot ); \ + \ + P_PTR += ( 1 * SVL * SVL ); \ + } + +__arm_new( "za" ) __arm_locally_streaming void bli_dpackm_armsme_int_SVLx8SVL + ( + conj_t conja, + pack_t schema, + dim_t cdim_, + dim_t cdim_max, + dim_t cdim_bcast, + dim_t n_, + dim_t n_max_, + const void *kappa, + const void *a, inc_t inca_, inc_t lda_, + void *p, inc_t ldp_, + const void *params, + const cntx_t * cntx + ) +{ + const int64_t cdim = cdim_; + const int64_t n = n_; + const int64_t inca = inca_; + const int64_t lda = lda_; + const int64_t ldp = ldp_; + + double* restrict a_ = (double*)a; + double* restrict p_ = (double*)p; + + uint64_t SVL = svcntsd(); + + svfloat64x4_t tmp, tmp2; + svfloat64_t tmp3; + + const double* restrict alpha1 = a; + double* restrict pi1 = p; + + const bool gs = ( inca != 1 && lda != 1 ); + if ( !gs && cdim_bcast ) + { + if ( bli_deq1( *( (double*)kappa ) ) ) + { + if ( inca == 1 && ldp == 8 * SVL ) + // continous memory.packA style + { + svbool_t p0 = svwhilelt_b64( (int64_t)0, cdim ); + svbool_t p1 = svwhilelt_b64( (int64_t)( 1 * SVL ), cdim ); + svbool_t p2 = svwhilelt_b64( (int64_t)( 2 * SVL ), cdim ); + svbool_t p3 = svwhilelt_b64( (int64_t)( 3 * SVL ), cdim ); + svbool_t p4 = svwhilelt_b64( (int64_t)( 4 * SVL ), cdim ); + svbool_t p5 = svwhilelt_b64( (int64_t)( 5 * SVL ), cdim ); + svbool_t p6 = svwhilelt_b64( (int64_t)( 6 * SVL ), cdim ); + svbool_t p7 = svwhilelt_b64( (int64_t)( 7 * SVL ), cdim ); + + for ( dim_t k = n; k != 0; --k ) + { + svfloat64_t z0 = svld1_f64( p0, alpha1 + 0 * SVL ); + svfloat64_t z1 = svld1_f64( p1, alpha1 + 1 * SVL ); + svfloat64_t z2 = svld1_f64( p2, alpha1 + 2 * 
SVL ); + svfloat64_t z3 = svld1_f64( p3, alpha1 + 3 * SVL ); + svfloat64_t z4 = svld1_f64( p4, alpha1 + 4 * SVL ); + svfloat64_t z5 = svld1_f64( p5, alpha1 + 5 * SVL ); + svfloat64_t z6 = svld1_f64( p6, alpha1 + 6 * SVL ); + svfloat64_t z7 = svld1_f64( p7, alpha1 + 7 * SVL ); + + tmp = svcreate4( z0, z1, z2, z3 ); + tmp2 = svcreate4( z4, z5, z6, z7 ); + + svst1_f64_x4( svptrue_c64(), pi1, tmp ); + svst1_f64_x4( svptrue_c64(), pi1 + 4 * SVL, tmp2 ); + + alpha1 += lda; + pi1 += ldp; + } + } + if ( inca == 1 && ldp == SVL ) + // continous memory.packA style + { + svbool_t p0 = svwhilelt_b64( (int64_t)0, cdim ); + for ( dim_t k = n; k != 0; --k ) + { + tmp3 = svld1_f64( p0, alpha1 ); + svst1_f64( svptrue_b64(), pi1, tmp3 ); + + alpha1 += lda; + pi1 += ldp; + } + } + else if ( inca != 1 && ldp == SVL ) + { + for ( uint64_t col = 0; col < n; col += 8 * SVL ) + { + int64_t valid_cols = n - col; + + // Determine total valid rows for this vertical block + // (max SVL) + int64_t valid_rows = ( cdim % SVL == 0 ) ? SVL : + ( cdim % SVL ); + + // Generate the 8 standard SVE column predicates for the + // safe edge-case loads + svbool_t pc0 = svwhilelt_b64( (int64_t)( 0 * SVL ), + valid_cols ); + svbool_t pc1 = svwhilelt_b64( (int64_t)( 1 * SVL ), + valid_cols ); + svbool_t pc2 = svwhilelt_b64( (int64_t)( 2 * SVL ), + valid_cols ); + svbool_t pc3 = svwhilelt_b64( (int64_t)( 3 * SVL ), + valid_cols ); + svbool_t pc4 = svwhilelt_b64( (int64_t)( 4 * SVL ), + valid_cols ); + svbool_t pc5 = svwhilelt_b64( (int64_t)( 5 * SVL ), + valid_cols ); + svbool_t pc6 = svwhilelt_b64( (int64_t)( 6 * SVL ), + valid_cols ); + svbool_t pc7 = svwhilelt_b64( (int64_t)( 7 * SVL ), + valid_cols ); + + svcount_t p_all = svptrue_c64(); + + for ( uint64_t trow = 0; trow < SVL; trow += 4 ) + { + const uint64_t tile_UL_corner = (trow)*inca + col; + + // 1. 
Create undefined default vectors + svfloat64_t undef_v = svundef_f64(); + svfloat64x4_t undef_x4 = svcreate4( undef_v, undef_v, + undef_v, undef_v ); + + // 2. Default all load arrays to empty + svfloat64x4_t zp0 = undef_x4, zp1 = undef_x4, + zp2 = undef_x4, zp3 = undef_x4; + svfloat64x4_t zp4 = undef_x4, zp5 = undef_x4, + zp6 = undef_x4, zp7 = undef_x4; + + // 3. Calculate rows left for all tiles + int64_t rows_left = valid_rows - trow; + + // 4. Loads + if ( valid_cols >= 8 * SVL ) + { + // FAST PATH: All 8*SVL columns exist + if ( rows_left > 0 ) + { + zp0 = svld1_f64_x4( p_all, + &a_[tile_UL_corner + 0 * inca] ); + zp4 = svld1_f64_x4( p_all, + &a_[tile_UL_corner + 0 * inca + 4 * SVL] ); + } + if ( rows_left > 1 ) + { + zp1 = svld1_f64_x4( p_all, + &a_[tile_UL_corner + 1 * inca] ); + zp5 = svld1_f64_x4( p_all, + &a_[tile_UL_corner + 1 * inca + 4 * SVL] ); + } + if ( rows_left > 2 ) + { + zp2 = svld1_f64_x4( p_all, + &a_[tile_UL_corner + 2 * inca] ); + zp6 = svld1_f64_x4( p_all, + &a_[tile_UL_corner + 2 * inca + 4 * SVL] ); + } + if ( rows_left > 3 ) + { + zp3 = svld1_f64_x4( p_all, + &a_[tile_UL_corner + 3 * inca] ); + zp7 = svld1_f64_x4( p_all, + &a_[tile_UL_corner + 3 * inca + 4 * SVL] ); + } + } + else + { + // SAFE PATH: Matrix edge + if ( rows_left > 0 ) + { + zp0 = svcreate4( svld1_f64( pc0, + &a_[tile_UL_corner + + 0 * inca + 0 * SVL] ), + svld1_f64( pc1, + &a_[tile_UL_corner + 0 * inca + + 1 * SVL] ), + svld1_f64( pc2, + &a_[tile_UL_corner + 0 * inca + + 2 * SVL] ), + svld1_f64( pc3, + &a_[tile_UL_corner + 0 * inca + + 3 * SVL] ) ); + zp4 = svcreate4( svld1_f64( pc4, + &a_[tile_UL_corner + + 0 * inca + 4 * SVL] ), + svld1_f64( pc5, + &a_[tile_UL_corner + 0 * inca + + 5 * SVL] ), + svld1_f64( pc6, + &a_[tile_UL_corner + 0 * inca + + 6 * SVL] ), + svld1_f64( pc7, + &a_[tile_UL_corner + 0 * inca + + 7 * SVL] ) ); + } + if ( rows_left > 1 ) + { + zp1 = svcreate4( svld1_f64( pc0, + &a_[tile_UL_corner + + 1 * inca + 0 * SVL] ), + svld1_f64( pc1, + 
&a_[tile_UL_corner + 1 * inca + + 1 * SVL] ), + svld1_f64( pc2, + &a_[tile_UL_corner + 1 * inca + + 2 * SVL] ), + svld1_f64( pc3, + &a_[tile_UL_corner + 1 * inca + + 3 * SVL] ) ); + zp5 = svcreate4( svld1_f64( pc4, + &a_[tile_UL_corner + + 1 * inca + 4 * SVL] ), + svld1_f64( pc5, + &a_[tile_UL_corner + 1 * inca + + 5 * SVL] ), + svld1_f64( pc6, + &a_[tile_UL_corner + 1 * inca + + 6 * SVL] ), + svld1_f64( pc7, + &a_[tile_UL_corner + 1 * inca + + 7 * SVL] ) ); + } + if ( rows_left > 2 ) + { + zp2 = svcreate4( svld1_f64( pc0, + &a_[tile_UL_corner + + 2 * inca + 0 * SVL] ), + svld1_f64( pc1, + &a_[tile_UL_corner + 2 * inca + + 1 * SVL] ), + svld1_f64( pc2, + &a_[tile_UL_corner + 2 * inca + + 2 * SVL] ), + svld1_f64( pc3, + &a_[tile_UL_corner + 2 * inca + + 3 * SVL] ) ); + zp6 = svcreate4( svld1_f64( pc4, + &a_[tile_UL_corner + + 2 * inca + 4 * SVL] ), + svld1_f64( pc5, + &a_[tile_UL_corner + 2 * inca + + 5 * SVL] ), + svld1_f64( pc6, + &a_[tile_UL_corner + 2 * inca + + 6 * SVL] ), + svld1_f64( pc7, + &a_[tile_UL_corner + 2 * inca + + 7 * SVL] ) ); + } + if ( rows_left > 3 ) + { + zp3 = svcreate4( svld1_f64( pc0, + &a_[tile_UL_corner + + 3 * inca + 0 * SVL] ), + svld1_f64( pc1, + &a_[tile_UL_corner + 3 * inca + + 1 * SVL] ), + svld1_f64( pc2, + &a_[tile_UL_corner + 3 * inca + + 2 * SVL] ), + svld1_f64( pc3, + &a_[tile_UL_corner + 3 * inca + + 3 * SVL] ) ); + zp7 = svcreate4( svld1_f64( pc4, + &a_[tile_UL_corner + + 3 * inca + 4 * SVL] ), + svld1_f64( pc5, + &a_[tile_UL_corner + 3 * inca + + 5 * SVL] ), + svld1_f64( pc6, + &a_[tile_UL_corner + 3 * inca + + 6 * SVL] ), + svld1_f64( pc7, + &a_[tile_UL_corner + 3 * inca + + 7 * SVL] ) ); + } + } + + // 5. 
Shuffle into x4 tuples + svfloat64x4_t zq0 = svcreate4( svget4( zp0, 0 ), + svget4( zp1, 0 ), svget4( zp2, 0 ), + svget4( zp3, 0 ) ); + svfloat64x4_t zq1 = svcreate4( svget4( zp0, 1 ), + svget4( zp1, 1 ), svget4( zp2, 1 ), + svget4( zp3, 1 ) ); + svfloat64x4_t zq2 = svcreate4( svget4( zp0, 2 ), + svget4( zp1, 2 ), svget4( zp2, 2 ), + svget4( zp3, 2 ) ); + svfloat64x4_t zq3 = svcreate4( svget4( zp0, 3 ), + svget4( zp1, 3 ), svget4( zp2, 3 ), + svget4( zp3, 3 ) ); + + svfloat64x4_t zq4 = svcreate4( svget4( zp4, 0 ), + svget4( zp5, 0 ), svget4( zp6, 0 ), + svget4( zp7, 0 ) ); + svfloat64x4_t zq5 = svcreate4( svget4( zp4, 1 ), + svget4( zp5, 1 ), svget4( zp6, 1 ), + svget4( zp7, 1 ) ); + svfloat64x4_t zq6 = svcreate4( svget4( zp4, 2 ), + svget4( zp5, 2 ), svget4( zp6, 2 ), + svget4( zp7, 2 ) ); + svfloat64x4_t zq7 = svcreate4( svget4( zp4, 3 ), + svget4( zp5, 3 ), svget4( zp6, 3 ), + svget4( zp7, 3 ) ); + + // 6. Write into ZA + svwrite_hor_za64_f64_vg4( 0, trow, zq0 ); + svwrite_hor_za64_f64_vg4( 1, trow, zq1 ); + svwrite_hor_za64_f64_vg4( 2, trow, zq2 ); + svwrite_hor_za64_f64_vg4( 3, trow, zq3 ); + svwrite_hor_za64_f64_vg4( 4, trow, zq4 ); + svwrite_hor_za64_f64_vg4( 5, trow, zq5 ); + svwrite_hor_za64_f64_vg4( 6, trow, zq6 ); + svwrite_hor_za64_f64_vg4( 7, trow, zq7 ); + } + // Check if we are at the edge and fewer than + // 8 * SVL columns remain + if ( col + ( 8 * SVL ) > n ) + { + int total_rem = n - col; + + // --- TILE 0 --- + if ( total_rem >= (int)SVL ) + { + PROCESS_FULL_TILE( 0, &p_[0] ); + total_rem -= SVL; + } + else + { + PROCESS_PARTIAL_TILE( 0, total_rem, &p_[0] ); + total_rem = 0; + } + + // --- TILE 1 --- + if ( total_rem > 0 ) + { + if ( total_rem >= (int)SVL ) + { + PROCESS_FULL_TILE( 1, &p_[SVL * SVL] ); + total_rem -= SVL; + } + else + { + PROCESS_PARTIAL_TILE( 1, total_rem, + &p_[SVL * SVL] ); + total_rem = 0; + } + } + + // --- TILE 2 --- + if ( total_rem > 0 ) + { + if ( total_rem >= (int)SVL ) + { + PROCESS_FULL_TILE( 2, &p_[2 * SVL * SVL] ); 
+ total_rem -= SVL; + } + else + { + PROCESS_PARTIAL_TILE( 2, total_rem, + &p_[2 * SVL * SVL] ); + total_rem = 0; + } + } + + // --- TILE 3 --- + if ( total_rem > 0 ) + { + if ( total_rem >= (int)SVL ) + { + PROCESS_FULL_TILE( 3, &p_[3 * SVL * SVL] ); + total_rem -= SVL; + } + else + { + PROCESS_PARTIAL_TILE( 3, total_rem, + &p_[3 * SVL * SVL] ); + total_rem = 0; + } + } + // --- TILE 4 --- + if ( total_rem > 0 ) + { + if ( total_rem >= (int)SVL ) + { + PROCESS_FULL_TILE( 4, &p_[4 * SVL * SVL] ); + total_rem -= SVL; + } + else + { + PROCESS_PARTIAL_TILE( 4, total_rem, + &p_[4 * SVL * SVL] ); + total_rem = 0; + } + } + + // --- TILE 5 --- + if ( total_rem > 0 ) + { + if ( total_rem >= (int)SVL ) + { + PROCESS_FULL_TILE( 5, &p_[5 * SVL * SVL] ); + total_rem -= SVL; + } + else + { + PROCESS_PARTIAL_TILE( 5, total_rem, + &p_[5 * SVL * SVL] ); + total_rem = 0; + } + } + + // --- TILE 6 --- + if ( total_rem > 0 ) + { + if ( total_rem >= (int)SVL ) + { + PROCESS_FULL_TILE( 6, &p_[6 * SVL * SVL] ); + total_rem -= SVL; + } + else + { + PROCESS_PARTIAL_TILE( 6, total_rem, + &p_[6 * SVL * SVL] ); + total_rem = 0; + } + } + + // --- TILE 7 --- + if ( total_rem > 0 ) + { + PROCESS_PARTIAL_TILE( 7, total_rem, + &p_[7 * SVL * SVL] ); + } + } + + else + { + // Read - as - columns and store + for ( uint64_t tcol = 0; tcol < SVL; tcol += 4 ) + { + svcount_t p0 = svptrue_c32(); + + // Each svread_ver reads 4 columns of the tile(SVL). + svfloat64x4_t zq0 = svread_ver_za64_f64_vg4( + /* tile: */ 0, /* slice: */ tcol ); + svfloat64x4_t zq2 = svread_ver_za64_f64_vg4( + /* tile: */ 2, /* slice: */ tcol ); + + svfloat64x4_t zq1 = svread_ver_za64_f64_vg4( + /* tile: */ 1, /* slice: */ tcol ); + svfloat64x4_t zq3 = svread_ver_za64_f64_vg4( + /* tile: */ 3, /* slice: */ tcol ); + + svst1( p0, &p_[0], zq0 ); + svst1( p0, &p_[SVL * SVL], zq1 ); + svst1( p0, &p_[2 * SVL * SVL], zq2 ); + svst1( p0, &p_[3 * SVL * SVL], zq3 ); + + // Each svread_ver reads 4 columns of the tile(SVL). 
+ svfloat64x4_t zq4 = svread_ver_za64_f64_vg4( + /* tile: */ 4, /* slice: */ tcol ); + svfloat64x4_t zq5 = svread_ver_za64_f64_vg4( + /* tile: */ 5, /* slice: */ tcol ); + + svfloat64x4_t zq6 = svread_ver_za64_f64_vg4( + /* tile: */ 6, /* slice: */ tcol ); + svfloat64x4_t zq7 = svread_ver_za64_f64_vg4( + /* tile: */ 7, /* slice: */ tcol ); + + svst1( p0, &p_[4 * SVL * SVL], zq4 ); + svst1( p0, &p_[5 * SVL * SVL], zq5 ); + svst1( p0, &p_[6 * SVL * SVL], zq6 ); + svst1( p0, &p_[7 * SVL * SVL], zq7 ); + + p_ += ( 4 * SVL ); + } + p_ += ( 7 * SVL * SVL ); + } + } + + p_ = (double*)p; + } + else if ( inca != 1 && ldp == 8 * SVL ) + { + for ( uint64_t col = 0; col < n; col += SVL ) + { + int64_t valid_cols = n - col; + + // Determine total valid rows for this vertical block + // (max 8 * SVL) + int64_t valid_rows = ( cdim % ( 8 * SVL ) == 0 ) ? + ( 8 * SVL ) : + ( cdim % ( 8 * SVL ) ); + + // Generate a standard SVE column predicate for the safe + // edge-case loads + svbool_t p_col = svwhilelt_b64( (int64_t)0, valid_cols ); + svbool_t p_all = svptrue_b64(); + + if ( valid_cols >= SVL && valid_rows >= 8 * SVL ) + { + // FAST PATH: Perfect 8*SVL x SVL block + for ( uint64_t trow = 0; trow < SVL; trow += 4 ) + { + const uint64_t tile_0 = (trow)*inca + col; + const uint64_t tile_1 = tile_0 + inca * SVL; + const uint64_t tile_2 = tile_0 + inca * 2 * SVL; + const uint64_t tile_3 = tile_0 + inca * 3 * SVL; + const uint64_t tile_4 = tile_0 + inca * 4 * SVL; + const uint64_t tile_5 = tile_0 + inca * 5 * SVL; + const uint64_t tile_6 = tile_0 + inca * 6 * SVL; + const uint64_t tile_7 = tile_0 + inca * 7 * SVL; + + svfloat64x4_t zq0 = svcreate4( + svld1_f64( p_all, &a_[tile_0 + 0 * inca] ), + svld1_f64( p_all, &a_[tile_0 + 1 * inca] ), + svld1_f64( p_all, &a_[tile_0 + 2 * inca] ), + svld1_f64( p_all, &a_[tile_0 + 3 * inca] ) ); + + svfloat64x4_t zq1 = svcreate4( + svld1_f64( p_all, &a_[tile_1 + 0 * inca] ), + svld1_f64( p_all, &a_[tile_1 + 1 * inca] ), + svld1_f64( p_all, 
&a_[tile_1 + 2 * inca] ), + svld1_f64( p_all, &a_[tile_1 + 3 * inca] ) ); + + svfloat64x4_t zq2 = svcreate4( + svld1_f64( p_all, &a_[tile_2 + 0 * inca] ), + svld1_f64( p_all, &a_[tile_2 + 1 * inca] ), + svld1_f64( p_all, &a_[tile_2 + 2 * inca] ), + svld1_f64( p_all, &a_[tile_2 + 3 * inca] ) ); + + svfloat64x4_t zq3 = svcreate4( + svld1_f64( p_all, &a_[tile_3 + 0 * inca] ), + svld1_f64( p_all, &a_[tile_3 + 1 * inca] ), + svld1_f64( p_all, &a_[tile_3 + 2 * inca] ), + svld1_f64( p_all, &a_[tile_3 + 3 * inca] ) ); + + svwrite_hor_za64_f64_vg4( 0, trow, zq0 ); + svwrite_hor_za64_f64_vg4( 1, trow, zq1 ); + svwrite_hor_za64_f64_vg4( 2, trow, zq2 ); + svwrite_hor_za64_f64_vg4( 3, trow, zq3 ); + + svfloat64x4_t zq4 = svcreate4( + svld1_f64( p_all, &a_[tile_4 + 0 * inca] ), + svld1_f64( p_all, &a_[tile_4 + 1 * inca] ), + svld1_f64( p_all, &a_[tile_4 + 2 * inca] ), + svld1_f64( p_all, &a_[tile_4 + 3 * inca] ) ); + + svfloat64x4_t zq5 = svcreate4( + svld1_f64( p_all, &a_[tile_5 + 0 * inca] ), + svld1_f64( p_all, &a_[tile_5 + 1 * inca] ), + svld1_f64( p_all, &a_[tile_5 + 2 * inca] ), + svld1_f64( p_all, &a_[tile_5 + 3 * inca] ) ); + + svfloat64x4_t zq6 = svcreate4( + svld1_f64( p_all, &a_[tile_6 + 0 * inca] ), + svld1_f64( p_all, &a_[tile_6 + 1 * inca] ), + svld1_f64( p_all, &a_[tile_6 + 2 * inca] ), + svld1_f64( p_all, &a_[tile_6 + 3 * inca] ) ); + + svfloat64x4_t zq7 = svcreate4( + svld1_f64( p_all, &a_[tile_7 + 0 * inca] ), + svld1_f64( p_all, &a_[tile_7 + 1 * inca] ), + svld1_f64( p_all, &a_[tile_7 + 2 * inca] ), + svld1_f64( p_all, &a_[tile_7 + 3 * inca] ) ); + + svwrite_hor_za64_f64_vg4( 4, trow, zq4 ); + svwrite_hor_za64_f64_vg4( 5, trow, zq5 ); + svwrite_hor_za64_f64_vg4( 6, trow, zq6 ); + svwrite_hor_za64_f64_vg4( 7, trow, zq7 ); + } + } + else + { + // SAFE PATH: Matrix edge + for ( uint64_t trow = 0; trow < SVL; trow += 4 ) + { + // 1. 
Create undefined default vectors + svfloat64_t undef_v = svundef_f64(); + svfloat64x4_t undef_x4 = svcreate4( undef_v, + undef_v, undef_v, undef_v ); + + // 2. Default all load arrays to empty + svfloat64x4_t zq0 = undef_x4, zq1 = undef_x4, + zq2 = undef_x4, zq3 = undef_x4; + svfloat64x4_t zq4 = undef_x4, zq5 = undef_x4, + zq6 = undef_x4, zq7 = undef_x4; + + const uint64_t tile_0 = (trow)*inca + col; + const uint64_t tile_1 = tile_0 + inca * SVL; + const uint64_t tile_2 = tile_0 + inca * 2 * SVL; + const uint64_t tile_3 = tile_0 + inca * 3 * SVL; + const uint64_t tile_4 = tile_0 + inca * 4 * SVL; + const uint64_t tile_5 = tile_0 + inca * 5 * SVL; + const uint64_t tile_6 = tile_0 + inca * 6 * SVL; + const uint64_t tile_7 = tile_0 + inca * 7 * SVL; + + // 3. Calculate rows left independently for each + // tile + int64_t rows_left_t0 = valid_rows - + ( 0 * SVL + trow ); + int64_t rows_left_t1 = valid_rows - + ( 1 * SVL + trow ); + int64_t rows_left_t2 = valid_rows - + ( 2 * SVL + trow ); + int64_t rows_left_t3 = valid_rows - + ( 3 * SVL + trow ); + int64_t rows_left_t4 = valid_rows - + ( 4 * SVL + trow ); + int64_t rows_left_t5 = valid_rows - + ( 5 * SVL + trow ); + int64_t rows_left_t6 = valid_rows - + ( 6 * SVL + trow ); + int64_t rows_left_t7 = valid_rows - + ( 7 * SVL + trow ); + + // 4. Loads for each tile + if ( rows_left_t0 > 0 ) + { + zq0 = svcreate4( ( rows_left_t0 > 0 ) ? + svld1_f64( p_col, + &a_[tile_0 + 0 * inca] ) : + undef_v, + ( rows_left_t0 > 1 ) ? + svld1_f64( p_col, + &a_[tile_0 + 1 * inca] ) : + undef_v, + ( rows_left_t0 > 2 ) ? + svld1_f64( p_col, + &a_[tile_0 + 2 * inca] ) : + undef_v, + ( rows_left_t0 > 3 ) ? + svld1_f64( p_col, + &a_[tile_0 + 3 * inca] ) : + undef_v ); + } + + if ( rows_left_t1 > 0 ) + { + zq1 = svcreate4( ( rows_left_t1 > 0 ) ? + svld1_f64( p_col, + &a_[tile_1 + 0 * inca] ) : + undef_v, + ( rows_left_t1 > 1 ) ? + svld1_f64( p_col, + &a_[tile_1 + 1 * inca] ) : + undef_v, + ( rows_left_t1 > 2 ) ? 
+ svld1_f64( p_col, + &a_[tile_1 + 2 * inca] ) : + undef_v, + ( rows_left_t1 > 3 ) ? + svld1_f64( p_col, + &a_[tile_1 + 3 * inca] ) : + undef_v ); + } + + if ( rows_left_t2 > 0 ) + { + zq2 = svcreate4( ( rows_left_t2 > 0 ) ? + svld1_f64( p_col, + &a_[tile_2 + 0 * inca] ) : + undef_v, + ( rows_left_t2 > 1 ) ? + svld1_f64( p_col, + &a_[tile_2 + 1 * inca] ) : + undef_v, + ( rows_left_t2 > 2 ) ? + svld1_f64( p_col, + &a_[tile_2 + 2 * inca] ) : + undef_v, + ( rows_left_t2 > 3 ) ? + svld1_f64( p_col, + &a_[tile_2 + 3 * inca] ) : + undef_v ); + } + + if ( rows_left_t3 > 0 ) + { + zq3 = svcreate4( ( rows_left_t3 > 0 ) ? + svld1_f64( p_col, + &a_[tile_3 + 0 * inca] ) : + undef_v, + ( rows_left_t3 > 1 ) ? + svld1_f64( p_col, + &a_[tile_3 + 1 * inca] ) : + undef_v, + ( rows_left_t3 > 2 ) ? + svld1_f64( p_col, + &a_[tile_3 + 2 * inca] ) : + undef_v, + ( rows_left_t3 > 3 ) ? + svld1_f64( p_col, + &a_[tile_3 + 3 * inca] ) : + undef_v ); + } + + if ( rows_left_t4 > 0 ) + { + zq4 = svcreate4( ( rows_left_t4 > 0 ) ? + svld1_f64( p_col, + &a_[tile_4 + 0 * inca] ) : + undef_v, + ( rows_left_t4 > 1 ) ? + svld1_f64( p_col, + &a_[tile_4 + 1 * inca] ) : + undef_v, + ( rows_left_t4 > 2 ) ? + svld1_f64( p_col, + &a_[tile_4 + 2 * inca] ) : + undef_v, + ( rows_left_t4 > 3 ) ? + svld1_f64( p_col, + &a_[tile_4 + 3 * inca] ) : + undef_v ); + } + + if ( rows_left_t5 > 0 ) + { + zq5 = svcreate4( ( rows_left_t5 > 0 ) ? + svld1_f64( p_col, + &a_[tile_5 + 0 * inca] ) : + undef_v, + ( rows_left_t5 > 1 ) ? + svld1_f64( p_col, + &a_[tile_5 + 1 * inca] ) : + undef_v, + ( rows_left_t5 > 2 ) ? + svld1_f64( p_col, + &a_[tile_5 + 2 * inca] ) : + undef_v, + ( rows_left_t5 > 3 ) ? + svld1_f64( p_col, + &a_[tile_5 + 3 * inca] ) : + undef_v ); + } + + if ( rows_left_t6 > 0 ) + { + zq6 = svcreate4( ( rows_left_t6 > 0 ) ? + svld1_f64( p_col, + &a_[tile_6 + 0 * inca] ) : + undef_v, + ( rows_left_t6 > 1 ) ? + svld1_f64( p_col, + &a_[tile_6 + 1 * inca] ) : + undef_v, + ( rows_left_t6 > 2 ) ? 
+ svld1_f64( p_col, + &a_[tile_6 + 2 * inca] ) : + undef_v, + ( rows_left_t6 > 3 ) ? + svld1_f64( p_col, + &a_[tile_6 + 3 * inca] ) : + undef_v ); + } + + if ( rows_left_t7 > 0 ) + { + zq7 = svcreate4( ( rows_left_t7 > 0 ) ? + svld1_f64( p_col, + &a_[tile_7 + 0 * inca] ) : + undef_v, + ( rows_left_t7 > 1 ) ? + svld1_f64( p_col, + &a_[tile_7 + 1 * inca] ) : + undef_v, + ( rows_left_t7 > 2 ) ? + svld1_f64( p_col, + &a_[tile_7 + 2 * inca] ) : + undef_v, + ( rows_left_t7 > 3 ) ? + svld1_f64( p_col, + &a_[tile_7 + 3 * inca] ) : + undef_v ); + } + + // 5. Write into ZA + svwrite_hor_za64_f64_vg4( 0, trow, zq0 ); + svwrite_hor_za64_f64_vg4( 1, trow, zq1 ); + svwrite_hor_za64_f64_vg4( 2, trow, zq2 ); + svwrite_hor_za64_f64_vg4( 3, trow, zq3 ); + svwrite_hor_za64_f64_vg4( 4, trow, zq4 ); + svwrite_hor_za64_f64_vg4( 5, trow, zq5 ); + svwrite_hor_za64_f64_vg4( 6, trow, zq6 ); + svwrite_hor_za64_f64_vg4( 7, trow, zq7 ); + } + } + + // Check if we are at the edge where fewer than + // SVL columns remain + if ( col + SVL > n ) + { + int rem = n - col; + int tcol = 0; + + // 1. Process as many full VG4 blocks as possible + while ( rem >= 4 ) + { + OP_SHUFFLED_VG4( tcol, p_ ); + tcol += 4; + rem -= 4; + } + + // 2. Process a VG2 block if remaining + if ( rem >= 2 ) + { + OP_SHUFFLED_VG2( tcol, p_ ); + tcol += 2; + rem -= 2; + } + + // 3. Process the last column if remaining + if ( rem >= 1 ) + { + OP_SHUFFLED_VG1( tcol, p_ ); + } + } + else + { + // Read - as - columns and store + for ( uint64_t tcol = 0; tcol < SVL; tcol += 4 ) + { + svcount_t p0 = svptrue_c32(); + + // Each svread_ver reads 4 columns of the tile(SVL). 
+ svfloat64x4_t zq0 = svread_ver_za64_f64_vg4( + /* tile: */ 0, /* slice: */ tcol ); + svfloat64x4_t zq2 = svread_ver_za64_f64_vg4( + /* tile: */ 2, /* slice: */ tcol ); + + svfloat64x4_t zq1 = svread_ver_za64_f64_vg4( + /* tile: */ 1, /* slice: */ tcol ); + svfloat64x4_t zq3 = svread_ver_za64_f64_vg4( + /* tile: */ 3, /* slice: */ tcol ); + + svfloat64x4_t zq0_ = svcreate4( svget4( zq0, 0 ), + svget4( zq1, 0 ), svget4( zq2, 0 ), + svget4( zq3, 0 ) ); + + svfloat64x4_t zq1_ = svcreate4( svget4( zq0, 1 ), + svget4( zq1, 1 ), svget4( zq2, 1 ), + svget4( zq3, 1 ) ); + + svfloat64x4_t zq2_ = svcreate4( svget4( zq0, 2 ), + svget4( zq1, 2 ), svget4( zq2, 2 ), + svget4( zq3, 2 ) ); + + svfloat64x4_t zq3_ = svcreate4( svget4( zq0, 3 ), + svget4( zq1, 3 ), svget4( zq2, 3 ), + svget4( zq3, 3 ) ); + + svst1( p0, &p_[0], zq0_ ); + svst1( p0, &p_[SVL * SVL], zq1_ ); + svst1( p0, &p_[2 * SVL * SVL], zq2_ ); + svst1( p0, &p_[3 * SVL * SVL], zq3_ ); + + // Each svread_ver reads 4 columns of the tile(SVL). 
+ svfloat64x4_t zq4 = svread_ver_za64_f64_vg4( + /* tile: */ 4, /* slice: */ tcol ); + svfloat64x4_t zq5 = svread_ver_za64_f64_vg4( + /* tile: */ 5, /* slice: */ tcol ); + + svfloat64x4_t zq6 = svread_ver_za64_f64_vg4( + /* tile: */ 6, /* slice: */ tcol ); + svfloat64x4_t zq7 = svread_ver_za64_f64_vg4( + /* tile: */ 7, /* slice: */ tcol ); + + svfloat64x4_t zq4_ = svcreate4( svget4( zq4, 0 ), + svget4( zq5, 0 ), svget4( zq6, 0 ), + svget4( zq7, 0 ) ); + + svfloat64x4_t zq5_ = svcreate4( svget4( zq4, 1 ), + svget4( zq5, 1 ), svget4( zq6, 1 ), + svget4( zq7, 1 ) ); + + svfloat64x4_t zq6_ = svcreate4( svget4( zq4, 2 ), + svget4( zq5, 2 ), svget4( zq6, 2 ), + svget4( zq7, 2 ) ); + + svfloat64x4_t zq7_ = svcreate4( svget4( zq4, 3 ), + svget4( zq5, 3 ), svget4( zq6, 3 ), + svget4( zq7, 3 ) ); + + svst1( p0, &p_[4 * SVL], zq4_ ); + svst1( p0, &p_[SVL * SVL + 4 * SVL], zq5_ ); + svst1( p0, &p_[2 * SVL * SVL + 4 * SVL], zq6_ ); + svst1( p0, &p_[3 * SVL * SVL + 4 * SVL], zq7_ ); + + p_ += ( 4 * SVL * SVL ); + } + } + } + + p_ = (double*)p; + } + } + else + { + bli_dscal2bbs_mxn + ( + conja, + cdim_, + n_, + kappa, + a, inca, lda, + p_, cdim_bcast, ldp + ); + } + } + else + { + bli_dscal2bbs_mxn + ( + conja, + cdim_, + n_, + kappa, + a, inca, lda, + p_, cdim_bcast, ldp + ); + } + + bli_dset0s_edge + ( + cdim_ * cdim_bcast, cdim_max * cdim_bcast, + n_, n_max_, + p_, ldp + ); +} diff --git a/kernels/armsme/1m/bli_packm_armsme_int_s2SVLx2SVL.c b/kernels/armsme/1m/bli_packm_armsme_int_s2SVLx2SVL.c new file mode 100644 index 000000000..e4cf5551d --- /dev/null +++ b/kernels/armsme/1m/bli_packm_armsme_int_s2SVLx2SVL.c @@ -0,0 +1,578 @@ +/* + * + * BLIS An object-based framework for developing high-performance BLAS-like + * libraries. 
+ * + * Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, + * Linaro Limited + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. - + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. - Neither the + * name(s) of the copyright holder(s) nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include + +#include "blis.h" + +// MACROS FOR FALLTHROUGH LOGIC + +// 1. 
Core Read & Shuffle Logic +#define READ_AND_SHUFFLE_VG4_1( tcol, zq0_, zq2_ ) \ + svfloat32x4_t zq0 = svread_ver_za32_f32_vg4( 0, tcol ); \ + svfloat32x4_t zq2 = svread_ver_za32_f32_vg4( 2, tcol ); \ + zq0_ = svcreate4( svget4( zq0, 0 ), svget4( zq2, 0 ), svget4( zq0, 1 ), \ + svget4( zq2, 1 ) ); \ + zq2_ = svcreate4( svget4( zq0, 2 ), svget4( zq2, 2 ), svget4( zq0, 3 ), \ + svget4( zq2, 3 ) ); + +#define READ_AND_SHUFFLE_VG4_2( tcol, zq1_, zq3_ ) \ + svfloat32x4_t zq1 = svread_ver_za32_f32_vg4( 1, tcol ); \ + svfloat32x4_t zq3 = svread_ver_za32_f32_vg4( 3, tcol ); \ + zq1_ = svcreate4( svget4( zq1, 0 ), svget4( zq3, 0 ), svget4( zq1, 1 ), \ + svget4( zq3, 1 ) ); \ + zq3_ = svcreate4( svget4( zq1, 2 ), svget4( zq3, 2 ), svget4( zq1, 3 ), \ + svget4( zq3, 3 ) ); + +#define READ_AND_SHUFFLE_VG2_1( tcol, zq0_ ) \ + svfloat32x2_t zq0 = svread_ver_za32_f32_vg2( 0, tcol ); \ + svfloat32x2_t zq2 = svread_ver_za32_f32_vg2( 2, tcol ); \ + zq0_ = svcreate4( svget2( zq0, 0 ), svget2( zq2, 0 ), svget2( zq0, 1 ), \ + svget2( zq2, 1 ) ); + +#define READ_AND_SHUFFLE_VG2_2( tcol, zq1_ ) \ + svfloat32x2_t zq1 = svread_ver_za32_f32_vg2( 1, tcol ); \ + svfloat32x2_t zq3 = svread_ver_za32_f32_vg2( 3, tcol ); \ + zq1_ = svcreate4( svget2( zq1, 0 ), svget2( zq3, 0 ), svget2( zq1, 1 ), \ + svget2( zq3, 1 ) ); + +#define READ_AND_SHUFFLE_1( tcol, zq0_ ) \ + svfloat32_t zq0 = svread_ver_za32_m( zq0, svptrue_b32(), 0, tcol ); \ + svfloat32_t zq2 = svread_ver_za32_m( zq2, svptrue_b32(), 2, tcol ); \ + zq0_ = svcreate2( zq0, zq2 ); + +#define READ_AND_SHUFFLE_2( tcol, zq1_ ) \ + svfloat32_t zq1 = svread_ver_za32_m( zq1, svptrue_b32(), 1, tcol ); \ + svfloat32_t zq3 = svread_ver_za32_m( zq3, svptrue_b32(), 3, tcol ); \ + zq1_ = svcreate2( zq1, zq3 ); + +// 2. 
Execution Blocks combined with storing + +// [FULL] Stores 8 Vectors +#define OP_VG4_1( tcol, p_ ) \ + { \ + svcount_t p0 = svptrue_c32(); \ + svfloat32x4_t z0, z1; \ + READ_AND_SHUFFLE_VG4_1( tcol, z0, z1 ); \ + svst1( p0, &p_[0], z0 ); \ + svst1( p0, &p_[4 * SVL], z1 ); \ + p_ += ( 8 * SVL ); \ + } + +#define OP_VG4_2( tcol, p_ ) \ + { \ + svcount_t p0 = svptrue_c32(); \ + svfloat32x4_t z0, z1; \ + READ_AND_SHUFFLE_VG4_2( tcol, z0, z1 ); \ + svst1( p0, &p_[0], z0 ); \ + svst1( p0, &p_[4 * SVL], z1 ); \ + p_ += ( 8 * SVL ); \ + } + +// [TAIL VG2] Stores 4 Vectors +#define OP_TAIL_VG2_1( tcol, p_ ) \ + { \ + svcount_t p0 = svptrue_c32(); \ + svfloat32x4_t z0; \ + READ_AND_SHUFFLE_VG2_1( tcol, z0 ); \ + svst1( p0, &p_[0], z0 ); \ + p_ += ( 4 * SVL ); \ + } + +#define OP_TAIL_VG2_2( tcol, p_ ) \ + { \ + svcount_t p0 = svptrue_c32(); \ + svfloat32x4_t z0; \ + READ_AND_SHUFFLE_VG2_2( tcol, z0 ); \ + svst1( p0, &p_[0], z0 ); \ + p_ += ( 4 * SVL ); \ + } + +// [TAIL] Stores 2 Vectors +#define OP_TAIL_1( tcol, p_ ) \ + { \ + svcount_t p0 = svptrue_c32(); \ + svfloat32x2_t z0; \ + READ_AND_SHUFFLE_1( tcol, z0 ); \ + svst1( p0, &p_[0], z0 ); \ + p_ += ( 2 * SVL ); \ + } + +#define OP_TAIL_2( tcol, p_ ) \ + { \ + svcount_t p0 = svptrue_c32(); \ + svfloat32x2_t z0; \ + READ_AND_SHUFFLE_2( tcol, z0 ); \ + svst1( p0, &p_[0], z0 ); \ + p_ += ( 2 * SVL ); \ + } + +__arm_new( "za" ) __arm_locally_streaming void bli_spackm_armsme_int_2SVLx2SVL + ( + conj_t conja, + pack_t schema, + dim_t cdim_, + dim_t cdim_max, + dim_t cdim_bcast, + dim_t n_, + dim_t n_max_, + const void *kappa, + const void *a, inc_t inca_, inc_t lda_, + void *p, inc_t ldp_, + const void *params, + const cntx_t * cntx + ) +{ + const int64_t cdim = cdim_; + const int64_t n = n_; + const int64_t inca = inca_; + const int64_t lda = lda_; + const int64_t ldp = ldp_; + + float* restrict a_ = (float*)a; + float* restrict p_ = (float*)p; + + uint64_t SVL = svcntsw(); + + svfloat32x2_t tmp; + + const float* restrict 
alpha1 = a; + float* restrict pi1 = p; + + const bool gs = ( inca != 1 && lda != 1 ); + + if ( cdim_bcast == 1 && !gs ) + { + if ( bli_seq1( *( (float*)kappa ) ) ) + { + if ( inca == 1 ) + // continous memory.packA style + { + svbool_t p0 = svwhilelt_b32( (int64_t)0, cdim ); + svbool_t p1 = svwhilelt_b32( (int64_t)SVL, cdim ); + + for ( dim_t k = n; k != 0; --k ) + { + svfloat32_t z0 = svld1_f32( p0, alpha1 + 0 * SVL ); + svfloat32_t z1 = svld1_f32( p1, alpha1 + 1 * SVL ); + + tmp = svcreate2( z0, z1 ); + + svst1_f32_x2( svptrue_c32(), pi1, tmp ); + + alpha1 += lda; + pi1 += ldp; + } + } + else + { + { + for ( uint64_t col = 0; col < n; col += 2 * SVL ) + { + int64_t valid_cols = n - col; + + // Determine total valid rows for this vertical block + // (max 2 * SVL) + int64_t valid_rows = ( cdim % ( 2 * SVL ) == 0 ) ? + ( 2 * SVL ) : + ( cdim % ( 2 * SVL ) ); + + // Generate the 2 standard SVE column predicates for the + // pairs of left and right tiles + svbool_t pc0 = svwhilelt_b32( (int64_t)( 0 * SVL ), + valid_cols ); + svbool_t pc1 = svwhilelt_b32( (int64_t)( 1 * SVL ), + valid_cols ); + + svcount_t p_all = svptrue_c32(); + + if ( valid_cols >= 2 * SVL && valid_rows >= 2 * SVL ) + { + // FAST PATH: Perfect 2*SVL x 2*SVL block + for ( uint64_t trow = 0; trow < SVL; trow += 4 ) + { + const uint64_t tile_UL_corner = (trow)*inca + + col; + + // Group 1 (Tiles 0 and 1) + svfloat32x2_t zp0 = svld1_f32_x2( p_all, + &a_[tile_UL_corner + 0 * inca] ); + svfloat32x2_t zp1 = svld1_f32_x2( p_all, + &a_[tile_UL_corner + 1 * inca] ); + svfloat32x2_t zp2 = svld1_f32_x2( p_all, + &a_[tile_UL_corner + 2 * inca] ); + svfloat32x2_t zp3 = svld1_f32_x2( p_all, + &a_[tile_UL_corner + 3 * inca] ); + + const uint64_t tile_BL_corner = tile_UL_corner + + inca * SVL; + + // Group 1 (Tiles 2 and 3) + svfloat32x2_t zp4 = svld1_f32_x2( p_all, + &a_[tile_BL_corner + 0 * inca] ); + svfloat32x2_t zp5 = svld1_f32_x2( p_all, + &a_[tile_BL_corner + 1 * inca] ); + svfloat32x2_t zp6 = svld1_f32_x2( 
p_all, + &a_[tile_BL_corner + 2 * inca] ); + svfloat32x2_t zp7 = svld1_f32_x2( p_all, + &a_[tile_BL_corner + 3 * inca] ); + + // Shuffle into x4 tuples + svfloat32x4_t zq0 = svcreate4( svget2( zp0, 0 ), + svget2( zp1, 0 ), svget2( zp2, 0 ), + svget2( zp3, 0 ) ); + svfloat32x4_t zq1 = svcreate4( svget2( zp0, 1 ), + svget2( zp1, 1 ), svget2( zp2, 1 ), + svget2( zp3, 1 ) ); + svfloat32x4_t zq2 = svcreate4( svget2( zp4, 0 ), + svget2( zp5, 0 ), svget2( zp6, 0 ), + svget2( zp7, 0 ) ); + svfloat32x4_t zq3 = svcreate4( svget2( zp4, 1 ), + svget2( zp5, 1 ), svget2( zp6, 1 ), + svget2( zp7, 1 ) ); + + // ZA writes + svwrite_hor_za32_f32_vg4( 0, trow, zq0 ); + svwrite_hor_za32_f32_vg4( 1, trow, zq1 ); + svwrite_hor_za32_f32_vg4( 2, trow, zq2 ); + svwrite_hor_za32_f32_vg4( 3, trow, zq3 ); + } + } + else + { + // SAFE PATH: Matrix edge + for ( uint64_t trow = 0; trow < SVL; trow += 4 ) + { + const uint64_t tile_UL_corner = (trow)*inca + + col; + const uint64_t tile_BL_corner = tile_UL_corner + + inca * SVL; + + // 1. Create undefined default vectors + svfloat32_t undef_v = svundef_f32(); + svfloat32x2_t undef_x2 = svcreate2( undef_v, + undef_v ); + + // 2. Default all load arrays to empty + svfloat32x2_t zp0 = undef_x2, zp1 = undef_x2, + zp2 = undef_x2, zp3 = undef_x2; + svfloat32x2_t zp4 = undef_x2, zp5 = undef_x2, + zp6 = undef_x2, zp7 = undef_x2; + + // 3. Calculate rows left independently for the + // top and bottom block + int64_t rows_left_top = valid_rows - trow; + int64_t rows_left_bot = valid_rows - + ( SVL + trow ); + + // 4. 
Load top rows (writes to tiles 0,1) + if ( rows_left_top > 0 ) + zp0 = svcreate2( + svld1_f32( pc0, + &a_[tile_UL_corner + 0 * inca] ), + svld1_f32( pc1, + &a_[tile_UL_corner + 0 * inca + + SVL] ) ); + if ( rows_left_top > 1 ) + zp1 = svcreate2( + svld1_f32( pc0, + &a_[tile_UL_corner + 1 * inca] ), + svld1_f32( pc1, + &a_[tile_UL_corner + 1 * inca + + SVL] ) ); + if ( rows_left_top > 2 ) + zp2 = svcreate2( + svld1_f32( pc0, + &a_[tile_UL_corner + 2 * inca] ), + svld1_f32( pc1, + &a_[tile_UL_corner + 2 * inca + + SVL] ) ); + if ( rows_left_top > 3 ) + zp3 = svcreate2( + svld1_f32( pc0, + &a_[tile_UL_corner + 3 * inca] ), + svld1_f32( pc1, + &a_[tile_UL_corner + 3 * inca + + SVL] ) ); + + // 5. Load bottom rows (writes to tiles 2, 3) + if ( rows_left_bot > 0 ) + zp4 = svcreate2( + svld1_f32( pc0, + &a_[tile_BL_corner + 0 * inca] ), + svld1_f32( pc1, + &a_[tile_BL_corner + 0 * inca + + SVL] ) ); + if ( rows_left_bot > 1 ) + zp5 = svcreate2( + svld1_f32( pc0, + &a_[tile_BL_corner + 1 * inca] ), + svld1_f32( pc1, + &a_[tile_BL_corner + 1 * inca + + SVL] ) ); + if ( rows_left_bot > 2 ) + zp6 = svcreate2( + svld1_f32( pc0, + &a_[tile_BL_corner + 2 * inca] ), + svld1_f32( pc1, + &a_[tile_BL_corner + 2 * inca + + SVL] ) ); + if ( rows_left_bot > 3 ) + zp7 = svcreate2( + svld1_f32( pc0, + &a_[tile_BL_corner + 3 * inca] ), + svld1_f32( pc1, + &a_[tile_BL_corner + 3 * inca + + SVL] ) ); + + // 6. Shuffle into x4 tuples + svfloat32x4_t zq0 = svcreate4( svget2( zp0, 0 ), + svget2( zp1, 0 ), svget2( zp2, 0 ), + svget2( zp3, 0 ) ); + svfloat32x4_t zq1 = svcreate4( svget2( zp0, 1 ), + svget2( zp1, 1 ), svget2( zp2, 1 ), + svget2( zp3, 1 ) ); + svfloat32x4_t zq2 = svcreate4( svget2( zp4, 0 ), + svget2( zp5, 0 ), svget2( zp6, 0 ), + svget2( zp7, 0 ) ); + svfloat32x4_t zq3 = svcreate4( svget2( zp4, 1 ), + svget2( zp5, 1 ), svget2( zp6, 1 ), + svget2( zp7, 1 ) ); + + // 7. 
Write into ZA + svwrite_hor_za32_f32_vg4( 0, trow, zq0 ); + svwrite_hor_za32_f32_vg4( 1, trow, zq1 ); + svwrite_hor_za32_f32_vg4( 2, trow, zq2 ); + svwrite_hor_za32_f32_vg4( 3, trow, zq3 ); + } + } + // Check if we are at the edge and fewer than + // 2 * SVL columns remain + if ( col + ( 2 * SVL ) > n ) + { + // Total columns left to process in this tail. + // Range: [1, 2*SVL - 1] + int total_rem = n - col; + + // Split total_rem into columns for Tile Pair 0/2 + // (rem1) and 1/3 (rem2) Each vertical tile pair has + // a width of SVL. + int rem1 = ( total_rem > (int)SVL ) ? (int)SVL : + total_rem; + int rem2 = ( total_rem > (int)SVL ) ? + ( total_rem - (int)SVL ) : + 0; + + // PART 1: Process Tiles 0 & 2 + if ( rem1 > 0 ) + { + int tcol = 0; + int n4 = rem1 >> 2; + + if ( n4 > 0 ) + { + int i = ( n4 + 3 ) >> 2; + // Duff's Device unrolling VG4 operations + switch ( n4 & 3 ) + { + case 0: + do + { + OP_VG4_1( tcol, p_ ); + tcol += 4; + case 3: + OP_VG4_1( tcol, p_ ); + tcol += 4; + case 2: + OP_VG4_1( tcol, p_ ); + tcol += 4; + case 1: + OP_VG4_1( tcol, p_ ); + tcol += 4; + } while ( --i > 0 ); + } + } + + // Handle remaining 1, 2, or 3 columns + switch ( rem1 & 3 ) + { + case 3: + OP_TAIL_VG2_1( tcol, p_ ); + tcol += 2; + OP_TAIL_1( tcol, p_ ); + break; + case 2: + OP_TAIL_VG2_1( tcol, p_ ); + break; + case 1: + OP_TAIL_1( tcol, p_ ); + break; + default: + break; + } + } + + // PART 2: Process Tiles 1 & 3 + if ( rem2 > 0 ) + { + int tcol = 0; + int n4 = rem2 >> 2; + + if ( n4 > 0 ) + { + int i = ( n4 + 3 ) >> 2; + // Duff's Device unrolling VG4 operations + switch ( n4 & 3 ) + { + case 0: + do + { + OP_VG4_2( tcol, p_ ); + tcol += 4; + case 3: + OP_VG4_2( tcol, p_ ); + tcol += 4; + case 2: + OP_VG4_2( tcol, p_ ); + tcol += 4; + case 1: + OP_VG4_2( tcol, p_ ); + tcol += 4; + } while ( --i > 0 ); + } + } + + // Handle remaining 1, 2, or 3 columns + switch ( rem2 & 3 ) + { + case 3: + OP_TAIL_VG2_2( tcol, p_ ); + tcol += 2; + OP_TAIL_2( tcol, p_ ); + break; + case 2: 
+ OP_TAIL_VG2_2( tcol, p_ ); + break; + case 1: + OP_TAIL_2( tcol, p_ ); + break; + default: + break; + } + } + } + + else + { + // Read - as - columns and store + for ( uint64_t tcol = 0; tcol < SVL; tcol += 4 ) + { + svcount_t p0 = svptrue_c32(); + + // Each svread_ver reads 4 columns of the + // tile(SVL). + svfloat32x4_t zq0 = svread_ver_za32_f32_vg4( + /* tile: */ 0, /* slice: */ tcol ); + svfloat32x4_t zq2 = svread_ver_za32_f32_vg4( + /* tile: */ 2, /* slice: */ tcol ); + + svfloat32x4_t zq1 = svread_ver_za32_f32_vg4( + /* tile: */ 1, /* slice: */ tcol ); + svfloat32x4_t zq3 = svread_ver_za32_f32_vg4( + /* tile: */ 3, /* slice: */ tcol ); + + svfloat32x4_t zq0_ = svcreate4( + svget4( zq0, 0 ), svget4( zq2, 0 ), + svget4( zq0, 1 ), svget4( zq2, 1 ) ); + + svfloat32x4_t zq1_ = svcreate4( + svget4( zq0, 2 ), svget4( zq2, 2 ), + svget4( zq0, 3 ), svget4( zq2, 3 ) ); + + svfloat32x4_t zq2_ = svcreate4( + svget4( zq1, 0 ), svget4( zq3, 0 ), + svget4( zq1, 1 ), svget4( zq3, 1 ) ); + + svfloat32x4_t zq3_ = svcreate4( + svget4( zq1, 2 ), svget4( zq3, 2 ), + svget4( zq1, 3 ), svget4( zq3, 3 ) ); + + svst1( p0, &p_[0], zq0_ ); + svst1( p0, &p_[4 * SVL], zq1_ ); + svst1( p0, &p_[2 * SVL * SVL], zq2_ ); + svst1( p0, &p_[2 * SVL * SVL + 4 * SVL], zq3_ ); + p_ += ( 8 * SVL ); + } + p_ += ( 2 * SVL * SVL ); + } + } + } + + p_ = (float*)p; + } + } + else + { + bli_sscal2bbs_mxn + ( + conja, + cdim_, + n_, + kappa, + a, inca, lda, + p_, cdim_bcast, ldp + ); + + } + } + else + { + bli_sscal2bbs_mxn + ( + conja, + cdim_, + n_, + kappa, + a, inca, lda, + p_, cdim_bcast, ldp + ); + } + + bli_sset0s_edge + ( + cdim_ * cdim_bcast, cdim_max * cdim_bcast, + n_, n_max_, + p_, ldp + ); +} diff --git a/kernels/armsme/1m/bli_packm_armsme_int_sSVLx4SVL.c b/kernels/armsme/1m/bli_packm_armsme_int_sSVLx4SVL.c new file mode 100644 index 000000000..ee65face6 --- /dev/null +++ b/kernels/armsme/1m/bli_packm_armsme_int_sSVLx4SVL.c @@ -0,0 +1,805 @@ +/* + * + * BLIS An object-based framework for 
developing high-performance BLAS-like + * libraries. + * + * Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, + * Linaro Limited + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. - + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. - Neither the + * name(s) of the copyright holder(s) nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include + +#include "blis.h" + +// MACROS FOR FALLTHROUGH LOGIC + +// PATH 1 + +// 1. 
Core Read & Store Logic +#define OP_VG4( TILE, TCOL, P_PTR ) \ + { \ + svcount_t p_all = svptrue_c32(); \ + svfloat32x4_t z = svread_ver_za32_f32_vg4( TILE, TCOL ); \ + svst1( p_all, P_PTR, z ); \ + P_PTR += ( 4 * SVL ); \ + } + +#define OP_VG2( TILE, TCOL, P_PTR ) \ + { \ + svcount_t p_all = svptrue_c32(); \ + svfloat32x2_t z = svread_ver_za32_f32_vg2( TILE, TCOL ); \ + svst1( p_all, P_PTR, z ); \ + P_PTR += ( 2 * SVL ); \ + } + +#define OP_VG1( TILE, TCOL, P_PTR ) \ + { \ + svbool_t p_true = svptrue_b32(); \ + svfloat32_t z = svread_ver_za32_m( z, p_true, TILE, TCOL ); \ + svst1( p_true, P_PTR, z ); \ + P_PTR += ( 1 * SVL ); \ + } + +// 2. Duff's Device Logic for a Partial Tile +#define PROCESS_PARTIAL_TILE( TILE_ID, REM, P_BASE ) \ + { \ + int tcol = 0; \ + float* p_curr = P_BASE; \ + int n4 = REM >> 2; /* Number of full VG4 blocks */ \ + \ + if ( n4 > 0 ) \ + { \ + switch ( n4 & 3 ) \ + { \ + case 0: \ + OP_VG4( TILE_ID, tcol, p_curr ); \ + tcol += 4; \ + case 3: \ + OP_VG4( TILE_ID, tcol, p_curr ); \ + tcol += 4; \ + case 2: \ + OP_VG4( TILE_ID, tcol, p_curr ); \ + tcol += 4; \ + case 1: \ + OP_VG4( TILE_ID, tcol, p_curr ); \ + tcol += 4; \ + } \ + } \ + \ + /* Handle remaining 1, 2, or 3 columns */ \ + switch ( REM & 3 ) \ + { \ + case 3: \ + OP_VG2( TILE_ID, tcol, p_curr ); \ + tcol += 2; \ + OP_VG1( TILE_ID, tcol, p_curr ); \ + break; \ + case 2: \ + OP_VG2( TILE_ID, tcol, p_curr ); \ + break; \ + case 1: \ + OP_VG1( TILE_ID, tcol, p_curr ); \ + break; \ + default: \ + break; \ + } \ + } + +// 3. Logic for a Full Tile +#define PROCESS_FULL_TILE( TILE_ID, P_BASE ) \ + { \ + float* p_curr = P_BASE; \ + for ( int tcol = 0; tcol < SVL; tcol += 4 ) \ + { \ + OP_VG4( TILE_ID, tcol, p_curr ); \ + } \ + } + +// PATH 2 + +// 1. 
Core Read, Shuffle & Store Logic +#define OP_SHUFFLED_VG4( TCOL, P_PTR ) \ + { \ + svcount_t p_all = svptrue_c32(); \ + svfloat32x4_t zq0 = svread_ver_za32_f32_vg4( 0, TCOL ); \ + svfloat32x4_t zq1 = svread_ver_za32_f32_vg4( 1, TCOL ); \ + svfloat32x4_t zq2 = svread_ver_za32_f32_vg4( 2, TCOL ); \ + svfloat32x4_t zq3 = svread_ver_za32_f32_vg4( 3, TCOL ); \ + \ + /* Shuffle and Store Column 0 (Offset 0) */ \ + svfloat32x4_t z_c0 = svcreate4( svget4( zq0, 0 ), svget4( zq1, 0 ), \ + svget4( zq2, 0 ), svget4( zq3, 0 ) ); \ + svst1( p_all, P_PTR, z_c0 ); \ + \ + /* Shuffle and Store Column 1 (Offset 4*SVL) */ \ + svfloat32x4_t z_c1 = svcreate4( svget4( zq0, 1 ), svget4( zq1, 1 ), \ + svget4( zq2, 1 ), svget4( zq3, 1 ) ); \ + svst1( p_all, P_PTR + 4 * SVL, z_c1 ); \ + \ + /* Shuffle and Store Column 2 (Offset 8*SVL) */ \ + svfloat32x4_t z_c2 = svcreate4( svget4( zq0, 2 ), svget4( zq1, 2 ), \ + svget4( zq2, 2 ), svget4( zq3, 2 ) ); \ + svst1( p_all, P_PTR + 8 * SVL, z_c2 ); \ + \ + /* Shuffle and Store Column 3 (Offset 12*SVL) */ \ + svfloat32x4_t z_c3 = svcreate4( svget4( zq0, 3 ), svget4( zq1, 3 ), \ + svget4( zq2, 3 ), svget4( zq3, 3 ) ); \ + svst1( p_all, P_PTR + 12 * SVL, z_c3 ); \ + \ + P_PTR += ( 16 * SVL ); \ + } + +#define OP_SHUFFLED_VG2( TCOL, P_PTR ) \ + { \ + svcount_t p_all = svptrue_c32(); \ + svfloat32x2_t zq0 = svread_ver_za32_f32_vg2( 0, TCOL ); \ + svfloat32x2_t zq1 = svread_ver_za32_f32_vg2( 1, TCOL ); \ + svfloat32x2_t zq2 = svread_ver_za32_f32_vg2( 2, TCOL ); \ + svfloat32x2_t zq3 = svread_ver_za32_f32_vg2( 3, TCOL ); \ + \ + svfloat32x4_t z_c0 = svcreate4( svget2( zq0, 0 ), svget2( zq1, 0 ), \ + svget2( zq2, 0 ), svget2( zq3, 0 ) ); \ + svst1( p_all, P_PTR, z_c0 ); \ + \ + svfloat32x4_t z_c1 = svcreate4( svget2( zq0, 1 ), svget2( zq1, 1 ), \ + svget2( zq2, 1 ), svget2( zq3, 1 ) ); \ + svst1( p_all, P_PTR + 4 * SVL, z_c1 ); \ + \ + P_PTR += ( 8 * SVL ); \ + } + +#define OP_SHUFFLED_VG1( TCOL, P_PTR ) \ + { \ + svbool_t p_true = svptrue_b32(); \ + 
svcount_t p_cnt = svptrue_c32(); \
+        /* svundef_f32() as merge source: every lane is written under the */ \
+        /* all-true predicate, and this avoids initializing each vector */ \
+        /* from its own indeterminate value (undefined behavior in C). */ \
+        svfloat32_t z0 = svread_ver_za32_m( svundef_f32(), p_true, 0, TCOL ); \
+        svfloat32_t z1 = svread_ver_za32_m( svundef_f32(), p_true, 1, TCOL ); \
+        svfloat32_t z2 = svread_ver_za32_m( svundef_f32(), p_true, 2, TCOL ); \
+        svfloat32_t z3 = svread_ver_za32_m( svundef_f32(), p_true, 3, TCOL ); \
+        \
+        svfloat32x4_t z_c0 = svcreate4( z0, z1, z2, z3 ); \
+        svst1( p_cnt, P_PTR, z_c0 ); \
+        \
+        P_PTR += ( 4 * SVL ); \
+    }
+
+// Pack an SVL x 4*SVL micro-panel of A (single precision) into contiguous
+// storage, staging transposes through the SME ZA tile array when A is not
+// contiguous in the register dimension.
+__arm_new( "za" ) __arm_locally_streaming void bli_spackm_armsme_int_SVLx4SVL
+     (
+             conj_t conja,
+             pack_t schema,
+             dim_t cdim_,
+             dim_t cdim_max,
+             dim_t cdim_bcast,
+             dim_t n_,
+             dim_t n_max_,
+       const void *kappa,
+       const void *a, inc_t inca_, inc_t lda_,
+             void *p, inc_t ldp_,
+       const void *params,
+       const cntx_t * cntx
+     )
+{
+    const int64_t cdim = cdim_;
+    const int64_t n = n_;
+    const int64_t inca = inca_;
+    const int64_t lda = lda_;
+    const int64_t ldp = ldp_;
+
+    float* restrict a_ = (float*)a;
+    float* restrict p_ = (float*)p;
+
+    // Streaming vector length in 32-bit words.
+    uint64_t SVL = svcntsw();
+
+    svfloat32x4_t tmp;
+    svfloat32_t tmp2;
+
+    const float* restrict alpha1 = a;
+    float* restrict pi1 = p;
+
+    const bool gs = ( inca != 1 && lda != 1 );
+
+    // The fast paths below do not broadcast elements, so they require
+    // cdim_bcast == 1 exactly (not merely nonzero). Any larger broadcast
+    // factor must fall through to bli_sscal2bbs_mxn, as the sibling
+    // 2SVLx2SVL kernel does; testing plain truthiness here packed the
+    // panel without duplication and produced a wrong result.
+    if ( !gs && cdim_bcast == 1 )
+    {
+        if ( bli_seq1( *( (float*)kappa ) ) )
+        {
+            if ( inca == 1 && ldp == 4 * SVL )
+            // continuous memory.packA style
+            {
+                svbool_t p0 = svwhilelt_b32( (int64_t)0, cdim );
+                svbool_t p1 = svwhilelt_b32( (int64_t)SVL, cdim );
+                svbool_t p2 = svwhilelt_b32( (int64_t)( 2 * SVL ), cdim );
+                svbool_t p3 = svwhilelt_b32( (int64_t)( 3 * SVL ), cdim );
+
+                for ( dim_t k = n; k != 0; --k )
+                {
+                    svfloat32_t z0 = svld1_f32( p0, alpha1 + 0 * SVL );
+                    svfloat32_t z1 = svld1_f32( p1, alpha1 + 1 * SVL );
+                    svfloat32_t z2 = svld1_f32( p2, alpha1 + 2 * SVL );
+                    svfloat32_t z3 = svld1_f32( p3, alpha1 + 3 * SVL );
+
+                    tmp = svcreate4( z0, z1, z2, z3 );
+
+                    svst1_f32_x4( svptrue_c32(), pi1, tmp );
+
+                    alpha1 += lda;
+                    pi1 += ldp;
+                }
+            }
+            else if ( inca == 1 && ldp == SVL )
+            // continuous memory.packA style. 'else if' makes the mutually
+            // exclusive ldp cases an explicit chain, consistent with the
+            // 'else if' branch that follows.
+            {
+                svbool_t p0 = 
svwhilelt_b32( (int64_t)0, cdim ); + for ( dim_t k = n; k != 0; --k ) + { + tmp2 = svld1_f32( p0, alpha1 ); + svst1_f32( svptrue_b32(), pi1, tmp2 ); + + alpha1 += lda; + pi1 += ldp; + } + } + else if ( inca != 1 && ldp == SVL ) + { + for ( uint64_t col = 0; col < n; col += 4 * SVL ) + { + int64_t valid_cols = n - col; + + // Determine total valid rows for this vertical block + // (max SVL) + int64_t valid_rows = ( cdim % SVL == 0 ) ? SVL : + ( cdim % SVL ); + + // Generate the 4 standard SVE column predicates for the + // safe edge-case loads + svbool_t pc0 = svwhilelt_b32( (int64_t)0, valid_cols ); + svbool_t pc1 = svwhilelt_b32( (int64_t)( 1 * SVL ), + valid_cols ); + svbool_t pc2 = svwhilelt_b32( (int64_t)( 2 * SVL ), + valid_cols ); + svbool_t pc3 = svwhilelt_b32( (int64_t)( 3 * SVL ), + valid_cols ); + + svcount_t p_all = svptrue_c32(); + + for ( uint64_t trow = 0; trow < SVL; trow += 4 ) + { + const uint64_t tile_UL_corner = + ( /* row + */ trow ) * inca /* n */ + col; + + // 1. Create undefined default vectors + svfloat32_t undef_v = svundef_f32(); + svfloat32x4_t undef_x4 = svcreate4( undef_v, undef_v, + undef_v, undef_v ); + + // 2. Default all load arrays to empty + svfloat32x4_t zp0 = undef_x4, zp4 = undef_x4, + zp8 = undef_x4, zp12 = undef_x4; + + // 3. Calculate rows left for all tiles + int64_t rows_left = valid_rows - trow; + + // 4. 
Loads + if ( valid_cols >= 4 * SVL ) + { + // FAST PATH: All 4*SVL columns exist + if ( rows_left > 0 ) + zp0 = svld1_f32_x4( p_all, + &a_[tile_UL_corner + 0 * inca] ); + if ( rows_left > 1 ) + zp4 = svld1_f32_x4( p_all, + &a_[tile_UL_corner + 1 * inca] ); + if ( rows_left > 2 ) + zp8 = svld1_f32_x4( p_all, + &a_[tile_UL_corner + 2 * inca] ); + if ( rows_left > 3 ) + zp12 = svld1_f32_x4( p_all, + &a_[tile_UL_corner + 3 * inca] ); + } + else + { + // SAFE PATH: Matrix edge + if ( rows_left > 0 ) + { + zp0 = svcreate4( svld1_f32( pc0, + &a_[tile_UL_corner + + 0 * inca + 0 * SVL] ), + svld1_f32( pc1, + &a_[tile_UL_corner + 0 * inca + + 1 * SVL] ), + svld1_f32( pc2, + &a_[tile_UL_corner + 0 * inca + + 2 * SVL] ), + svld1_f32( pc3, + &a_[tile_UL_corner + 0 * inca + + 3 * SVL] ) ); + } + if ( rows_left > 1 ) + { + zp4 = svcreate4( svld1_f32( pc0, + &a_[tile_UL_corner + + 1 * inca + 0 * SVL] ), + svld1_f32( pc1, + &a_[tile_UL_corner + 1 * inca + + 1 * SVL] ), + svld1_f32( pc2, + &a_[tile_UL_corner + 1 * inca + + 2 * SVL] ), + svld1_f32( pc3, + &a_[tile_UL_corner + 1 * inca + + 3 * SVL] ) ); + } + if ( rows_left > 2 ) + { + zp8 = svcreate4( svld1_f32( pc0, + &a_[tile_UL_corner + + 2 * inca + 0 * SVL] ), + svld1_f32( pc1, + &a_[tile_UL_corner + 2 * inca + + 1 * SVL] ), + svld1_f32( pc2, + &a_[tile_UL_corner + 2 * inca + + 2 * SVL] ), + svld1_f32( pc3, + &a_[tile_UL_corner + 2 * inca + + 3 * SVL] ) ); + } + if ( rows_left > 3 ) + { + zp12 = svcreate4( svld1_f32( pc0, + &a_[tile_UL_corner + + 3 * inca + 0 * SVL] ), + svld1_f32( pc1, + &a_[tile_UL_corner + 3 * inca + + 1 * SVL] ), + svld1_f32( pc2, + &a_[tile_UL_corner + 3 * inca + + 2 * SVL] ), + svld1_f32( pc3, + &a_[tile_UL_corner + 3 * inca + + 3 * SVL] ) ); + } + } + + // 5. 
Shuffle into x4 tuples + svfloat32x4_t zq0 = svcreate4( svget4( zp0, 0 ), + svget4( zp4, 0 ), svget4( zp8, 0 ), + svget4( zp12, 0 ) ); + + svfloat32x4_t zq1 = svcreate4( svget4( zp0, 1 ), + svget4( zp4, 1 ), svget4( zp8, 1 ), + svget4( zp12, 1 ) ); + + svfloat32x4_t zq2 = svcreate4( svget4( zp0, 2 ), + svget4( zp4, 2 ), svget4( zp8, 2 ), + svget4( zp12, 2 ) ); + + svfloat32x4_t zq3 = svcreate4( svget4( zp0, 3 ), + svget4( zp4, 3 ), svget4( zp8, 3 ), + svget4( zp12, 3 ) ); + + // 6. Write into ZA + svwrite_hor_za32_f32_vg4( 0, trow, zq0 ); + svwrite_hor_za32_f32_vg4( 1, trow, zq1 ); + svwrite_hor_za32_f32_vg4( 2, trow, zq2 ); + svwrite_hor_za32_f32_vg4( 3, trow, zq3 ); + } + // Check if we are at the edge and fewer than + // 4 * SVL columns remain + if ( col + ( 4 * SVL ) > n ) + { + int total_rem = n - col; + + // --- TILE 0 --- + if ( total_rem >= (int)SVL ) + { + PROCESS_FULL_TILE( 0, &p_[0] ); + total_rem -= SVL; + } + else + { + PROCESS_PARTIAL_TILE( 0, total_rem, &p_[0] ); + total_rem = 0; + } + + // --- TILE 1 --- + if ( total_rem > 0 ) + { + if ( total_rem >= (int)SVL ) + { + PROCESS_FULL_TILE( 1, &p_[SVL * SVL] ); + total_rem -= SVL; + } + else + { + PROCESS_PARTIAL_TILE( 1, total_rem, + &p_[SVL * SVL] ); + total_rem = 0; + } + } + + // --- TILE 2 --- + if ( total_rem > 0 ) + { + if ( total_rem >= (int)SVL ) + { + PROCESS_FULL_TILE( 2, &p_[2 * SVL * SVL] ); + total_rem -= SVL; + } + else + { + PROCESS_PARTIAL_TILE( 2, total_rem, + &p_[2 * SVL * SVL] ); + total_rem = 0; + } + } + + // --- TILE 3 --- + if ( total_rem > 0 ) + { + PROCESS_PARTIAL_TILE( 3, total_rem, + &p_[3 * SVL * SVL] ); + } + } + + else + { + // Read - as - columns and store + for ( uint64_t tcol = 0; tcol < SVL; tcol += 4 ) + { + svcount_t p0 = svptrue_c32(); + + // Each svread_ver reads 4 columns of the tile(SVL). 
+ svfloat32x4_t zq0 = svread_ver_za32_f32_vg4( + /* tile: */ 0, /* slice: */ tcol ); + svfloat32x4_t zq2 = svread_ver_za32_f32_vg4( + /* tile: */ 2, /* slice: */ tcol ); + + svfloat32x4_t zq1 = svread_ver_za32_f32_vg4( + /* tile: */ 1, /* slice: */ tcol ); + svfloat32x4_t zq3 = svread_ver_za32_f32_vg4( + /* tile: */ 3, /* slice: */ tcol ); + + svst1( p0, &p_[0], zq0 ); + svst1( p0, &p_[SVL * SVL], zq1 ); + svst1( p0, &p_[2 * SVL * SVL], zq2 ); + svst1( p0, &p_[3 * SVL * SVL], zq3 ); + + p_ += ( 4 * SVL ); + } + p_ += ( 3 * SVL * SVL ); + } + } + + p_ = (float*)p; + } + else if ( inca != 1 && ldp == 4 * SVL ) + { + for ( uint64_t col = 0; col < n; col += SVL ) + { + int64_t valid_cols = n - col; + + // Determine total valid rows for this vertical block + // (max 4 * SVL) + int64_t valid_rows = ( cdim % ( 4 * SVL ) == 0 ) ? + ( 4 * SVL ) : + ( cdim % ( 4 * SVL ) ); + + // Generate a standard SVE column predicate for the safe + // edge-case loads + svbool_t p_col = svwhilelt_b32( (int64_t)0, valid_cols ); + svbool_t p_all = svptrue_b32(); + + if ( valid_cols >= SVL && valid_rows >= 4 * SVL ) + { + // FAST PATH: Perfect 4*SVL x SVL block + for ( uint64_t trow = 0; trow < SVL; trow += 4 ) + { + const uint64_t tile_UL_corner = (trow)*inca + col; + const uint64_t tile_BL_corner = tile_UL_corner + + inca * SVL; + const uint64_t tile_BBL_corner = tile_UL_corner + + 2 * inca * SVL; + const uint64_t tile_BBBL_corner = tile_UL_corner + + 3 * inca * SVL; + + svfloat32x4_t zq0 = + svcreate4( svld1_f32( p_all, + &a_[tile_UL_corner + 0 * inca] ), + svld1_f32( p_all, + &a_[tile_UL_corner + 1 * inca] ), + svld1_f32( p_all, + &a_[tile_UL_corner + 2 * inca] ), + svld1_f32( p_all, + &a_[tile_UL_corner + 3 * inca] ) ); + + svfloat32x4_t zq1 = + svcreate4( svld1_f32( p_all, + &a_[tile_BL_corner + 0 * inca] ), + svld1_f32( p_all, + &a_[tile_BL_corner + 1 * inca] ), + svld1_f32( p_all, + &a_[tile_BL_corner + 2 * inca] ), + svld1_f32( p_all, + &a_[tile_BL_corner + 3 * inca] ) ); + + 
svfloat32x4_t zq2 = svcreate4( + svld1_f32( p_all, + &a_[tile_BBL_corner + 0 * inca] ), + svld1_f32( p_all, + &a_[tile_BBL_corner + 1 * inca] ), + svld1_f32( p_all, + &a_[tile_BBL_corner + 2 * inca] ), + svld1_f32( p_all, + &a_[tile_BBL_corner + 3 * inca] ) ); + + svfloat32x4_t zq3 = svcreate4( + svld1_f32( p_all, + &a_[tile_BBBL_corner + 0 * inca] ), + svld1_f32( p_all, + &a_[tile_BBBL_corner + 1 * inca] ), + svld1_f32( p_all, + &a_[tile_BBBL_corner + 2 * inca] ), + svld1_f32( p_all, + &a_[tile_BBBL_corner + 3 * inca] ) ); + + svwrite_hor_za32_f32_vg4( 0, trow, zq0 ); + svwrite_hor_za32_f32_vg4( 1, trow, zq1 ); + svwrite_hor_za32_f32_vg4( 2, trow, zq2 ); + svwrite_hor_za32_f32_vg4( 3, trow, zq3 ); + } + } + else + { + // SAFE PATH: Matrix edge + for ( uint64_t trow = 0; trow < SVL; trow += 4 ) + { + // 1. Create undefined default vectors + svfloat32_t undef_v = svundef_f32(); + svfloat32x4_t undef_x4 = svcreate4( undef_v, + undef_v, undef_v, undef_v ); + + // 2. Default all load arrays to empty + svfloat32x4_t zq0 = undef_x4, zq1 = undef_x4, + zq2 = undef_x4, zq3 = undef_x4; + + const uint64_t tile_UL_corner = (trow)*inca + col; + const uint64_t tile_BL_corner = tile_UL_corner + + inca * SVL; + const uint64_t tile_BBL_corner = tile_UL_corner + + 2 * inca * SVL; + const uint64_t tile_BBBL_corner = tile_UL_corner + + 3 * inca * SVL; + + // 3. Calculate rows left independently for each + // tile + int64_t rows_left_t0 = valid_rows - + ( 0 * SVL + trow ); + int64_t rows_left_t1 = valid_rows - + ( 1 * SVL + trow ); + int64_t rows_left_t2 = valid_rows - + ( 2 * SVL + trow ); + int64_t rows_left_t3 = valid_rows - + ( 3 * SVL + trow ); + + // 4. Loads for each tile + if ( rows_left_t0 > 0 ) + { + zq0 = svcreate4( ( rows_left_t0 > 0 ) ? + svld1_f32( p_col, + &a_[tile_UL_corner + 0 * inca] ) : + undef_v, + ( rows_left_t0 > 1 ) ? + svld1_f32( p_col, + &a_[tile_UL_corner + 1 * inca] ) : + undef_v, + ( rows_left_t0 > 2 ) ? 
+ svld1_f32( p_col, + &a_[tile_UL_corner + 2 * inca] ) : + undef_v, + ( rows_left_t0 > 3 ) ? + svld1_f32( p_col, + &a_[tile_UL_corner + 3 * inca] ) : + undef_v ); + } + + if ( rows_left_t1 > 0 ) + { + zq1 = svcreate4( ( rows_left_t1 > 0 ) ? + svld1_f32( p_col, + &a_[tile_BL_corner + 0 * inca] ) : + undef_v, + ( rows_left_t1 > 1 ) ? + svld1_f32( p_col, + &a_[tile_BL_corner + 1 * inca] ) : + undef_v, + ( rows_left_t1 > 2 ) ? + svld1_f32( p_col, + &a_[tile_BL_corner + 2 * inca] ) : + undef_v, + ( rows_left_t1 > 3 ) ? + svld1_f32( p_col, + &a_[tile_BL_corner + 3 * inca] ) : + undef_v ); + } + + if ( rows_left_t2 > 0 ) + { + zq2 = svcreate4( ( rows_left_t2 > 0 ) ? + svld1_f32( p_col, + &a_[tile_BBL_corner + 0 * inca] ) : + undef_v, + ( rows_left_t2 > 1 ) ? + svld1_f32( p_col, + &a_[tile_BBL_corner + 1 * inca] ) : + undef_v, + ( rows_left_t2 > 2 ) ? + svld1_f32( p_col, + &a_[tile_BBL_corner + 2 * inca] ) : + undef_v, + ( rows_left_t2 > 3 ) ? + svld1_f32( p_col, + &a_[tile_BBL_corner + 3 * inca] ) : + undef_v ); + } + + if ( rows_left_t3 > 0 ) + { + zq3 = svcreate4( ( rows_left_t3 > 0 ) ? + svld1_f32( p_col, + &a_[tile_BBBL_corner + 0 * inca] ) : + undef_v, + ( rows_left_t3 > 1 ) ? + svld1_f32( p_col, + &a_[tile_BBBL_corner + 1 * inca] ) : + undef_v, + ( rows_left_t3 > 2 ) ? + svld1_f32( p_col, + &a_[tile_BBBL_corner + 2 * inca] ) : + undef_v, + ( rows_left_t3 > 3 ) ? + svld1_f32( p_col, + &a_[tile_BBBL_corner + 3 * inca] ) : + undef_v ); + } + + // 5. Write into ZA + svwrite_hor_za32_f32_vg4( 0, trow, zq0 ); + svwrite_hor_za32_f32_vg4( 1, trow, zq1 ); + svwrite_hor_za32_f32_vg4( 2, trow, zq2 ); + svwrite_hor_za32_f32_vg4( 3, trow, zq3 ); + } + } + + // Check if we are at the edge and fewer than + // SVL columns remain + if ( col + SVL > n ) + { + int rem = n - col; + int tcol = 0; + + // 1. Process as many full VG4 blocks as possible + while ( rem >= 4 ) + { + OP_SHUFFLED_VG4( tcol, p_ ); + tcol += 4; + rem -= 4; + } + + // 2. 
Process a VG2 block if remaining + if ( rem >= 2 ) + { + OP_SHUFFLED_VG2( tcol, p_ ); + tcol += 2; + rem -= 2; + } + + // 3. Process the last column if remaining + if ( rem >= 1 ) + { + OP_SHUFFLED_VG1( tcol, p_ ); + } + } + else + { + // Read - as - columns and store + for ( uint64_t tcol = 0; tcol < SVL; tcol += 4 ) + { + svcount_t p0 = svptrue_c32(); + + // Each svread_ver reads 4 columns of the tile(SVL). + svfloat32x4_t zq0 = svread_ver_za32_f32_vg4( + /* tile: */ 0, /* slice: */ tcol ); + svfloat32x4_t zq2 = svread_ver_za32_f32_vg4( + /* tile: */ 2, /* slice: */ tcol ); + + svfloat32x4_t zq1 = svread_ver_za32_f32_vg4( + /* tile: */ 1, /* slice: */ tcol ); + svfloat32x4_t zq3 = svread_ver_za32_f32_vg4( + /* tile: */ 3, /* slice: */ tcol ); + + svfloat32x4_t zq0_ = svcreate4( svget4( zq0, 0 ), + svget4( zq1, 0 ), svget4( zq2, 0 ), + svget4( zq3, 0 ) ); + + svfloat32x4_t zq1_ = svcreate4( svget4( zq0, 1 ), + svget4( zq1, 1 ), svget4( zq2, 1 ), + svget4( zq3, 1 ) ); + + svfloat32x4_t zq2_ = svcreate4( svget4( zq0, 2 ), + svget4( zq1, 2 ), svget4( zq2, 2 ), + svget4( zq3, 2 ) ); + + svfloat32x4_t zq3_ = svcreate4( svget4( zq0, 3 ), + svget4( zq1, 3 ), svget4( zq2, 3 ), + svget4( zq3, 3 ) ); + + svst1( p0, &p_[0], zq0_ ); + svst1( p0, &p_[4 * SVL], zq1_ ); + svst1( p0, &p_[8 * SVL], zq2_ ); + svst1( p0, &p_[12 * SVL], zq3_ ); + + p_ += ( 16 * SVL ); + } + } + } + + p_ = (float*)p; + } + } + else + { + bli_sscal2bbs_mxn + ( + conja, + cdim_, + n_, + kappa, + a, inca, lda, + p_, cdim_bcast, ldp + ); + } + } + else + { + bli_sscal2bbs_mxn + ( + conja, + cdim_, + n_, + kappa, + a, inca, lda, + p_, cdim_bcast, ldp + ); + } + + bli_sset0s_edge + ( + cdim_ * cdim_bcast, cdim_max * cdim_bcast, + n_, n_max_, + p_, ldp + ); +} diff --git a/kernels/armsme/3/bli_gemm_armsme_int_2SVLx2SVL.c b/kernels/armsme/3/bli_gemm_armsme_int_2SVLx2SVL.c new file mode 100644 index 000000000..462f6a9ad --- /dev/null +++ b/kernels/armsme/3/bli_gemm_armsme_int_2SVLx2SVL.c @@ -0,0 +1,1818 @@ +/* 
+ + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2021, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are +met: +- Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name(s) of the copyright holder(s) nor the names of its +contributors may be used to endorse or promote products derived +from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + */ + +#include +#include +#include "blis.h" + +__arm_new( "za" ) __arm_locally_streaming void bli_sgemm_armsme_int_2SVLx2SVL + ( + dim_t m, + dim_t n, + dim_t k, + const void* alpha, + const void* a, + const void* b, + const void* beta, + void* c, inc_t rs_c, inc_t cs_c, + const auxinfo_t* data, + const cntx_t* cntx + ) +{ + uint64_t SVL = svcntsw(); + + GEMM_UKR_SETUP_CT_AMBI( s, 2 * SVL, 2 * SVL, false ); + + float *a_ = (float *)a; + float *b_ = (float *)b; + + float *a_next = (float *)bli_auxinfo_next_a( data ); + float *b_next = (float *)bli_auxinfo_next_b( data ); + + float *c_ = (float *)c; + + const uint64_t result_tile_TL_corner_ = 0; + const uint64_t result_tile_TR_corner_ = result_tile_TL_corner_ + SVL; + + if ( cs_c != 1 ) + { + __pldx( 1, 1, 0, + (float *)&c_[result_tile_TL_corner_ + ( ( ( 0 + 0 ) * cs_c ) )] ); + __pldx( 1, 1, 0, + (float *)&c_[result_tile_TL_corner_ + ( ( ( 1 + 0 ) * cs_c ) )] ); + __pldx( 1, 1, 0, + (float *)&c_[result_tile_TL_corner_ + ( ( ( 2 + 0 ) * cs_c ) )] ); + __pldx( 1, 1, 0, + (float *)&c_[result_tile_TL_corner_ + ( ( ( 3 + 0 ) * cs_c ) )] ); + + __pldx( 1, 1, 0, + (float *)&c_[result_tile_TL_corner_ + ( ( ( 4 + 0 ) * cs_c ) )] ); + __pldx( 1, 1, 0, + (float *)&c_[result_tile_TL_corner_ + ( ( ( 5 + 0 ) * cs_c ) )] ); + __pldx( 1, 1, 0, + (float *)&c_[result_tile_TL_corner_ + ( ( ( 6 + 0 ) * cs_c ) )] ); + __pldx( 1, 1, 0, + (float *)&c_[result_tile_TL_corner_ + ( ( ( 7 + 0 ) * cs_c ) )] ); + + __pldx( 1, 1, 0, + (float *)&c_[result_tile_TR_corner_ + ( ( ( 0 + 0 ) * cs_c ) )] ); + __pldx( 1, 1, 0, + (float *)&c_[result_tile_TR_corner_ + ( ( ( 1 + 0 ) * cs_c ) )] ); + __pldx( 1, 1, 0, + (float *)&c_[result_tile_TR_corner_ + ( ( ( 2 + 0 ) * cs_c ) )] ); + __pldx( 1, 1, 0, + (float *)&c_[result_tile_TR_corner_ + ( ( ( 3 + 0 ) * cs_c ) )] ); + + __pldx( 1, 1, 0, + (float *)&c_[result_tile_TR_corner_ + ( ( ( 4 + 0 ) * cs_c ) )] ); + __pldx( 1, 1, 0, + (float *)&c_[result_tile_TR_corner_ + ( ( ( 5 + 0 ) * cs_c ) )] ); + 
__pldx( 1, 1, 0, + (float *)&c_[result_tile_TR_corner_ + ( ( ( 6 + 0 ) * cs_c ) )] ); + __pldx( 1, 1, 0, + (float *)&c_[result_tile_TR_corner_ + ( ( ( 7 + 0 ) * cs_c ) )] ); + } + else + { + __pldx( 1, 1, 0, + (float *)&c_[result_tile_TL_corner_ + ( ( ( 0 + 0 ) * rs_c ) )] ); + __pldx( 1, 1, 0, + (float *)&c_[result_tile_TL_corner_ + ( ( ( 1 + 0 ) * rs_c ) )] ); + __pldx( 1, 1, 0, + (float *)&c_[result_tile_TL_corner_ + ( ( ( 2 + 0 ) * rs_c ) )] ); + __pldx( 1, 1, 0, + (float *)&c_[result_tile_TL_corner_ + ( ( ( 3 + 0 ) * rs_c ) )] ); + + __pldx( 1, 1, 0, + (float *)&c_[result_tile_TL_corner_ + ( ( ( 4 + 0 ) * rs_c ) )] ); + __pldx( 1, 1, 0, + (float *)&c_[result_tile_TL_corner_ + ( ( ( 5 + 0 ) * rs_c ) )] ); + __pldx( 1, 1, 0, + (float *)&c_[result_tile_TL_corner_ + ( ( ( 6 + 0 ) * rs_c ) )] ); + __pldx( 1, 1, 0, + (float *)&c_[result_tile_TL_corner_ + ( ( ( 7 + 0 ) * rs_c ) )] ); + + __pldx( 1, 1, 0, + (float *)&c_[result_tile_TR_corner_ + ( ( ( 0 + 0 ) * rs_c ) )] ); + __pldx( 1, 1, 0, + (float *)&c_[result_tile_TR_corner_ + ( ( ( 1 + 0 ) * rs_c ) )] ); + __pldx( 1, 1, 0, + (float *)&c_[result_tile_TR_corner_ + ( ( ( 2 + 0 ) * rs_c ) )] ); + __pldx( 1, 1, 0, + (float *)&c_[result_tile_TR_corner_ + ( ( ( 3 + 0 ) * rs_c ) )] ); + + __pldx( 1, 1, 0, + (float *)&c_[result_tile_TR_corner_ + ( ( ( 4 + 0 ) * rs_c ) )] ); + __pldx( 1, 1, 0, + (float *)&c_[result_tile_TR_corner_ + ( ( ( 5 + 0 ) * rs_c ) )] ); + __pldx( 1, 1, 0, + (float *)&c_[result_tile_TR_corner_ + ( ( ( 6 + 0 ) * rs_c ) )] ); + __pldx( 1, 1, 0, + (float *)&c_[result_tile_TR_corner_ + ( ( ( 7 + 0 ) * rs_c ) )] ); + } + + svzero_za(); + + uint64_t k_; + uint64_t k_iter = k / 8; + uint64_t k_left = k % 8; + + for ( k_ = 0; k_ < k_iter; k_++ ) + { + svfloat32x4_t zL00 = svld1_f32_x4( svptrue_c32(), + (float32_t *)( &a_[0] ) ); + svfloat32x4_t zR00 = svld1_f32_x4( svptrue_c32(), + (float32_t *)( &b_[0] ) ); + + svmopa_za32_m( 0, svptrue_b32(), svptrue_b32(), svget4( zL00, 0 ), + svget4( zR00, 0 ) ); + 
svmopa_za32_m( 1, svptrue_b32(), svptrue_b32(), svget4( zL00, 1 ), + svget4( zR00, 0 ) ); + + __pldx( 0, 1, 1, (float *)&a_next[0] ); + __pldx( 0, 1, 1, (float *)&b_next[0] ); + + svmopa_za32_m( 2, svptrue_b32(), svptrue_b32(), svget4( zL00, 0 ), + svget4( zR00, 1 ) ); + svmopa_za32_m( 3, svptrue_b32(), svptrue_b32(), svget4( zL00, 1 ), + svget4( zR00, 1 ) ); + + svmopa_za32_m( 0, svptrue_b32(), svptrue_b32(), svget4( zL00, 2 ), + svget4( zR00, 2 ) ); + svmopa_za32_m( 1, svptrue_b32(), svptrue_b32(), svget4( zL00, 3 ), + svget4( zR00, 2 ) ); + + svmopa_za32_m( 2, svptrue_b32(), svptrue_b32(), svget4( zL00, 2 ), + svget4( zR00, 3 ) ); + svmopa_za32_m( 3, svptrue_b32(), svptrue_b32(), svget4( zL00, 3 ), + svget4( zR00, 3 ) ); + + svfloat32x4_t zL02 = svld1_f32_x4( svptrue_c32(), + (float32_t *)( &a_[( 4 * SVL )] ) ); + svfloat32x4_t zR02 = svld1_f32_x4( svptrue_c32(), + (float32_t *)( &b_[( 4 * SVL )] ) ); + + svmopa_za32_m( 0, svptrue_b32(), svptrue_b32(), svget4( zL02, 0 ), + svget4( zR02, 0 ) ); + svmopa_za32_m( 1, svptrue_b32(), svptrue_b32(), svget4( zL02, 1 ), + svget4( zR02, 0 ) ); + + __pldx( 0, 1, 1, (float *)&a_next[4 * SVL] ); + __pldx( 0, 1, 1, (float *)&b_next[4 * SVL] ); + + svmopa_za32_m( 2, svptrue_b32(), svptrue_b32(), svget4( zL02, 0 ), + svget4( zR02, 1 ) ); + svmopa_za32_m( 3, svptrue_b32(), svptrue_b32(), svget4( zL02, 1 ), + svget4( zR02, 1 ) ); + + svmopa_za32_m( 0, svptrue_b32(), svptrue_b32(), svget4( zL02, 2 ), + svget4( zR02, 2 ) ); + svmopa_za32_m( 1, svptrue_b32(), svptrue_b32(), svget4( zL02, 3 ), + svget4( zR02, 2 ) ); + svmopa_za32_m( 2, svptrue_b32(), svptrue_b32(), svget4( zL02, 2 ), + svget4( zR02, 3 ) ); + svmopa_za32_m( 3, svptrue_b32(), svptrue_b32(), svget4( zL02, 3 ), + svget4( zR02, 3 ) ); + + svfloat32x4_t zL04 = svld1_f32_x4( svptrue_c32(), + (float32_t *)( &a_[8 * SVL] ) ); + svfloat32x4_t zR04 = svld1_f32_x4( svptrue_c32(), + (float32_t *)( &b_[8 * SVL] ) ); + + svmopa_za32_m( 0, svptrue_b32(), svptrue_b32(), svget4( zL04, 
0 ), + svget4( zR04, 0 ) ); + svmopa_za32_m( 1, svptrue_b32(), svptrue_b32(), svget4( zL04, 1 ), + svget4( zR04, 0 ) ); + + __pldx( 0, 1, 1, (float *)&a_next[8 * SVL] ); + __pldx( 0, 1, 1, (float *)&b_next[8 * SVL] ); + + svmopa_za32_m( 2, svptrue_b32(), svptrue_b32(), svget4( zL04, 0 ), + svget4( zR04, 1 ) ); + svmopa_za32_m( 3, svptrue_b32(), svptrue_b32(), svget4( zL04, 1 ), + svget4( zR04, 1 ) ); + + svmopa_za32_m( 0, svptrue_b32(), svptrue_b32(), svget4( zL04, 2 ), + svget4( zR04, 2 ) ); + svmopa_za32_m( 1, svptrue_b32(), svptrue_b32(), svget4( zL04, 3 ), + svget4( zR04, 2 ) ); + + svmopa_za32_m( 2, svptrue_b32(), svptrue_b32(), svget4( zL04, 2 ), + svget4( zR04, 3 ) ); + svmopa_za32_m( 3, svptrue_b32(), svptrue_b32(), svget4( zL04, 3 ), + svget4( zR04, 3 ) ); + + svfloat32x4_t zL06 = svld1_f32_x4( svptrue_c32(), + (float32_t *)( &a_[( 12 * SVL )] ) ); + svfloat32x4_t zR06 = svld1_f32_x4( svptrue_c32(), + (float32_t *)( &b_[( 12 * SVL )] ) ); + + svmopa_za32_m( 0, svptrue_b32(), svptrue_b32(), svget4( zL06, 0 ), + svget4( zR06, 0 ) ); + svmopa_za32_m( 1, svptrue_b32(), svptrue_b32(), svget4( zL06, 1 ), + svget4( zR06, 0 ) ); + + __pldx( 0, 1, 1, (float *)&a_next[12 * SVL] ); + __pldx( 0, 1, 1, (float *)&b_next[12 * SVL] ); + + svmopa_za32_m( 2, svptrue_b32(), svptrue_b32(), svget4( zL06, 0 ), + svget4( zR06, 1 ) ); + svmopa_za32_m( 3, svptrue_b32(), svptrue_b32(), svget4( zL06, 1 ), + svget4( zR06, 1 ) ); + + svmopa_za32_m( 0, svptrue_b32(), svptrue_b32(), svget4( zL06, 2 ), + svget4( zR06, 2 ) ); + svmopa_za32_m( 1, svptrue_b32(), svptrue_b32(), svget4( zL06, 3 ), + svget4( zR06, 2 ) ); + + svmopa_za32_m( 2, svptrue_b32(), svptrue_b32(), svget4( zL06, 2 ), + svget4( zR06, 3 ) ); + svmopa_za32_m( 3, svptrue_b32(), svptrue_b32(), svget4( zL06, 3 ), + svget4( zR06, 3 ) ); + + a_ += ( 2 * 8 * SVL ); + b_ += ( 2 * 8 * SVL ); + + a_next += ( 2 * 8 * SVL ); + b_next += ( 2 * 8 * SVL ); + } + + for ( k_ = 0; k_ < k_left; k_ += 1 ) + { + svfloat32x2_t zL00 = 
svld1_f32_x2( svptrue_c32(), + (float32_t *)( &a_[0] ) ); + svfloat32x2_t zR00 = svld1_f32_x2( svptrue_c32(), + (float32_t *)( &b_[0] ) ); + + svmopa_za32_m( 0, svptrue_b32(), svptrue_b32(), svget2( zL00, 0 ), + svget2( zR00, 0 ) ); + svmopa_za32_m( 1, svptrue_b32(), svptrue_b32(), svget2( zL00, 1 ), + svget2( zR00, 0 ) ); + + svmopa_za32_m( 2, svptrue_b32(), svptrue_b32(), svget2( zL00, 0 ), + svget2( zR00, 1 ) ); + svmopa_za32_m( 3, svptrue_b32(), svptrue_b32(), svget2( zL00, 1 ), + svget2( zR00, 1 ) ); + + a_ += ( 2 * SVL ); + b_ += ( 2 * SVL ); + } + + // Store ZA to matResult. + + const uint64_t result_tile_TL_corner = 0; + + float beta_ = *(float *)beta; + float alpha_ = *(float *)alpha; + + svfloat32_t zbeta = svdup_f32( beta_ ); + svfloat32_t zalpha = svdup_f32( alpha_ ); + + if ( rs_c == 1 ) + { + const uint64_t result_tile_TR_corner = SVL * cs_c; + + if ( beta_ == 0 ) + { + for ( uint64_t tcol = 0; tcol < SVL; tcol += 4 ) + { + // Read ZA slices into Z regs + svfloat32_t z0 = svread_ver_za32_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 0 ); + svfloat32_t z1 = svread_ver_za32_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 0 ); + svfloat32_t z2 = svread_ver_za32_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 0 ); + svfloat32_t z3 = svread_ver_za32_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 0 ); + + // Scale Z regs by broadcast alpha + z0 = svmul_f32_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f32_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f32_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f32_z( svptrue_b32(), z3, zalpha ); + + // Store full result into C + svfloat32x2_t z400 = svcreate2( z0, z1 ); + svst1_f32_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 0 ) * cs_c], z400 ); + + svfloat32x2_t z600 = svcreate2( z2, z3 ); + svst1_f32_x2( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 0 ) * cs_c], z600 ); + + // Repeat unfolded x4 + z0 = svread_ver_za32_m( z0, svptrue_b32(), + /* tile: 
*/ 0, /* slice: */ tcol + 1 ); + z1 = svread_ver_za32_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 1 ); + z2 = svread_ver_za32_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 1 ); + z3 = svread_ver_za32_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 1 ); + + z0 = svmul_f32_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f32_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f32_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f32_z( svptrue_b32(), z3, zalpha ); + + z400 = svcreate2( z0, z1 ); + svst1_f32_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 1 ) * cs_c], z400 ); + + z600 = svcreate2( z2, z3 ); + svst1_f32_x2( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 1 ) * cs_c], z600 ); + + z0 = svread_ver_za32_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 2 ); + z1 = svread_ver_za32_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 2 ); + z2 = svread_ver_za32_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 2 ); + z3 = svread_ver_za32_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 2 ); + + z0 = svmul_f32_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f32_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f32_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f32_z( svptrue_b32(), z3, zalpha ); + + z400 = svcreate2( z0, z1 ); + svst1_f32_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 2 ) * cs_c], z400 ); + + z600 = svcreate2( z2, z3 ); + svst1_f32_x2( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 2 ) * cs_c], z600 ); + + z0 = svread_ver_za32_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 3 ); + z1 = svread_ver_za32_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 3 ); + z2 = svread_ver_za32_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 3 ); + z3 = svread_ver_za32_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 3 ); + + z0 = svmul_f32_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f32_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f32_z( 
svptrue_b32(), z2, zalpha ); + z3 = svmul_f32_z( svptrue_b32(), z3, zalpha ); + + z400 = svcreate2( z0, z1 ); + svst1_f32_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 3 ) * cs_c], z400 ); + + z600 = svcreate2( z2, z3 ); + svst1_f32_x2( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 3 ) * cs_c], z600 ); + } + } + + // beta != 0 + else + { + for ( uint64_t tcol = 0; tcol < SVL; tcol += 4 ) + { + // Read ZA slices into Z regs + svfloat32_t z0 = svread_ver_za32_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 0 ); + svfloat32_t z1 = svread_ver_za32_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 0 ); + svfloat32_t z2 = svread_ver_za32_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 0 ); + svfloat32_t z3 = svread_ver_za32_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 0 ); + + // Scale Z regs by broadcast alpha + svfloat32_t z00 = svmul_f32_z( svptrue_b32(), z0, zalpha ); + svfloat32_t z10 = svmul_f32_z( svptrue_b32(), z1, zalpha ); + svfloat32_t z20 = svmul_f32_z( svptrue_b32(), z2, zalpha ); + svfloat32_t z30 = svmul_f32_z( svptrue_b32(), z3, zalpha ); + + // Load C into Z regs + svfloat32x2_t zq5 = svld1_f32_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 0 ) * cs_c ) )] ); + svfloat32x2_t zq6 = svld1_f32_x2( svptrue_c32(), + &c_[result_tile_TR_corner + ( ( ( tcol + 0 ) * cs_c ) )] ); + + // Scale Z regs by broadcast beta + svfloat32_t z40 = svmla_m( svptrue_b32(), z00, svget2( zq5, 0 ), + zbeta ); + svfloat32_t z50 = svmla_m( svptrue_b32(), z10, svget2( zq5, 1 ), + zbeta ); + svfloat32_t z60 = svmla_m( svptrue_b32(), z20, svget2( zq6, 0 ), + zbeta ); + svfloat32_t z70 = svmla_m( svptrue_b32(), z30, svget2( zq6, 1 ), + zbeta ); + + // Store full result into C + svfloat32x2_t z400 = svcreate2( z40, z50 ); + svst1_f32_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 0 ) * cs_c], z400 ); + + svfloat32x2_t z600 = svcreate2( z60, z70 ); + svst1_f32_x2( svptrue_c32(), + 
&c_[result_tile_TR_corner + ( tcol + 0 ) * cs_c], z600 ); + + // Repeat unfolded x4 + svfloat32_t z01 = svread_ver_za32_m( z01, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 1 ); + svfloat32_t z11 = svread_ver_za32_m( z11, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 1 ); + svfloat32_t z21 = svread_ver_za32_m( z21, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 1 ); + svfloat32_t z31 = svread_ver_za32_m( z31, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 1 ); + + svfloat32_t z02 = svmul_f32_z( svptrue_b32(), z01, zalpha ); + svfloat32_t z12 = svmul_f32_z( svptrue_b32(), z11, zalpha ); + svfloat32_t z22 = svmul_f32_z( svptrue_b32(), z21, zalpha ); + svfloat32_t z32 = svmul_f32_z( svptrue_b32(), z31, zalpha ); + + svfloat32x2_t zq51 = svld1_f32_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 1 ) * cs_c ) )] ); + svfloat32x2_t zq61 = svld1_f32_x2( svptrue_c32(), + &c_[result_tile_TR_corner + ( ( ( tcol + 1 ) * cs_c ) )] ); + + svfloat32_t z401 = svmla_m( svptrue_b32(), z02, + svget2( zq51, 0 ), zbeta ); + svfloat32_t z501 = svmla_m( svptrue_b32(), z12, + svget2( zq51, 1 ), zbeta ); + svfloat32_t z601 = svmla_m( svptrue_b32(), z22, + svget2( zq61, 0 ), zbeta ); + svfloat32_t z701 = svmla_m( svptrue_b32(), z32, + svget2( zq61, 1 ), zbeta ); + + svfloat32x2_t z4001 = svcreate2( z401, z501 ); + svst1_f32_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 1 ) * cs_c], z4001 ); + + svfloat32x2_t z6001 = svcreate2( z601, z701 ); + svst1_f32_x2( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 1 ) * cs_c], z6001 ); + + svfloat32_t z03 = svread_ver_za32_m( z03, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 2 ); + svfloat32_t z13 = svread_ver_za32_m( z13, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 2 ); + svfloat32_t z23 = svread_ver_za32_m( z23, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 2 ); + svfloat32_t z33 = svread_ver_za32_m( z33, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 2 ); + + 
svfloat32_t z04 = svmul_f32_z( svptrue_b32(), z03, zalpha ); + svfloat32_t z14 = svmul_f32_z( svptrue_b32(), z13, zalpha ); + svfloat32_t z24 = svmul_f32_z( svptrue_b32(), z23, zalpha ); + svfloat32_t z34 = svmul_f32_z( svptrue_b32(), z33, zalpha ); + + svfloat32x2_t zq52 = svld1_f32_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 2 ) * cs_c ) )] ); + svfloat32x2_t zq62 = svld1_f32_x2( svptrue_c32(), + &c_[result_tile_TR_corner + ( ( ( tcol + 2 ) * cs_c ) )] ); + + svfloat32_t z402 = svmla_m( svptrue_b32(), z04, + svget2( zq52, 0 ), zbeta ); + svfloat32_t z502 = svmla_m( svptrue_b32(), z14, + svget2( zq52, 1 ), zbeta ); + svfloat32_t z602 = svmla_m( svptrue_b32(), z24, + svget2( zq62, 0 ), zbeta ); + svfloat32_t z702 = svmla_m( svptrue_b32(), z34, + svget2( zq62, 1 ), zbeta ); + + svfloat32x2_t z4002 = svcreate2( z402, z502 ); + svst1_f32_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 2 ) * cs_c], z4002 ); + + svfloat32x2_t z6002 = svcreate2( z602, z702 ); + svst1_f32_x2( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 2 ) * cs_c], z6002 ); + + svfloat32_t z05 = svread_ver_za32_m( z05, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 3 ); + svfloat32_t z15 = svread_ver_za32_m( z15, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 3 ); + svfloat32_t z25 = svread_ver_za32_m( z25, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 3 ); + svfloat32_t z35 = svread_ver_za32_m( z35, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 3 ); + + svfloat32_t z06 = svmul_f32_z( svptrue_b32(), z05, zalpha ); + svfloat32_t z16 = svmul_f32_z( svptrue_b32(), z15, zalpha ); + svfloat32_t z26 = svmul_f32_z( svptrue_b32(), z25, zalpha ); + svfloat32_t z36 = svmul_f32_z( svptrue_b32(), z35, zalpha ); + + svfloat32x2_t zq53 = svld1_f32_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 3 ) * cs_c ) )] ); + svfloat32x2_t zq63 = svld1_f32_x2( svptrue_c32(), + &c_[result_tile_TR_corner + ( ( ( tcol + 3 ) * cs_c ) )] ); + + svfloat32_t z403 
= svmla_m( svptrue_b32(), z06, + svget2( zq53, 0 ), zbeta ); + svfloat32_t z503 = svmla_m( svptrue_b32(), z16, + svget2( zq53, 1 ), zbeta ); + svfloat32_t z603 = svmla_m( svptrue_b32(), z26, + svget2( zq63, 0 ), zbeta ); + svfloat32_t z703 = svmla_m( svptrue_b32(), z36, + svget2( zq63, 1 ), zbeta ); + + svfloat32x2_t z4003 = svcreate2( z403, z503 ); + svst1_f32_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 3 ) * cs_c], z4003 ); + + svfloat32x2_t z6003 = svcreate2( z603, z703 ); + svst1_f32_x2( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 3 ) * cs_c], z6003 ); + } + } + } + else + { + const uint64_t result_tile_BL_corner = SVL * rs_c; + + if ( beta_ == 0 ) + { + for ( uint64_t tcol = 0; tcol < SVL; tcol += 4 ) + { + // Read ZA slices into Z regs + svfloat32_t z0 = svread_hor_za32_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 0 ); + svfloat32_t z1 = svread_hor_za32_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 0 ); + svfloat32_t z2 = svread_hor_za32_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 0 ); + svfloat32_t z3 = svread_hor_za32_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 0 ); + + // Scale Z regs by broadcast alpha + z0 = svmul_f32_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f32_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f32_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f32_z( svptrue_b32(), z3, zalpha ); + + // Store full result into C + svfloat32x2_t z400 = svcreate2( z0, z2 ); + svst1_f32_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 0 ) * rs_c], z400 ); + + svfloat32x2_t z600 = svcreate2( z1, z3 ); + svst1_f32_x2( svptrue_c32(), + &c_[result_tile_BL_corner + ( tcol + 0 ) * rs_c], z600 ); + + // Repeat unfolded x4 + z0 = svread_hor_za32_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 1 ); + z1 = svread_hor_za32_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 1 ); + z2 = svread_hor_za32_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 1 ); + 
z3 = svread_hor_za32_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 1 ); + + z0 = svmul_f32_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f32_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f32_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f32_z( svptrue_b32(), z3, zalpha ); + + z400 = svcreate2( z0, z2 ); + svst1_f32_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 1 ) * rs_c], z400 ); + + z600 = svcreate2( z1, z3 ); + svst1_f32_x2( svptrue_c32(), + &c_[result_tile_BL_corner + ( tcol + 1 ) * rs_c], z600 ); + + z0 = svread_hor_za32_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 2 ); + z1 = svread_hor_za32_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 2 ); + z2 = svread_hor_za32_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 2 ); + z3 = svread_hor_za32_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 2 ); + + z0 = svmul_f32_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f32_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f32_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f32_z( svptrue_b32(), z3, zalpha ); + + z400 = svcreate2( z0, z2 ); + svst1_f32_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 2 ) * rs_c], z400 ); + + z600 = svcreate2( z1, z3 ); + svst1_f32_x2( svptrue_c32(), + &c_[result_tile_BL_corner + ( tcol + 2 ) * rs_c], z600 ); + + z0 = svread_hor_za32_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 3 ); + z1 = svread_hor_za32_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 3 ); + z2 = svread_hor_za32_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 3 ); + z3 = svread_hor_za32_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 3 ); + + z0 = svmul_f32_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f32_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f32_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f32_z( svptrue_b32(), z3, zalpha ); + + z400 = svcreate2( z0, z2 ); + svst1_f32_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 3 ) * rs_c], z400 ); + + 
z600 = svcreate2( z1, z3 ); + svst1_f32_x2( svptrue_c32(), + &c_[result_tile_BL_corner + ( tcol + 3 ) * rs_c], z600 ); + } + } + else + { + for ( uint64_t tcol = 0; tcol < SVL; tcol += 4 ) + { + // Read ZA slices into Z regs + svfloat32_t z0 = svread_hor_za32_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 0 ); + svfloat32_t z1 = svread_hor_za32_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 0 ); + svfloat32_t z2 = svread_hor_za32_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 0 ); + svfloat32_t z3 = svread_hor_za32_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 0 ); + + // Scale Z regs by broadcast alpha + svfloat32_t z00 = svmul_f32_z( svptrue_b32(), z0, zalpha ); + svfloat32_t z10 = svmul_f32_z( svptrue_b32(), z1, zalpha ); + svfloat32_t z20 = svmul_f32_z( svptrue_b32(), z2, zalpha ); + svfloat32_t z30 = svmul_f32_z( svptrue_b32(), z3, zalpha ); + + // Load C into Z regs + svfloat32x2_t zq5 = svld1_f32_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 0 ) * rs_c ) )] ); + svfloat32x2_t zq6 = svld1_f32_x2( svptrue_c32(), + &c_[result_tile_BL_corner + ( ( ( tcol + 0 ) * rs_c ) )] ); + + // Scale Z regs by broadcast beta + svfloat32_t z40 = svmla_m( svptrue_b32(), z00, svget2( zq5, 0 ), + zbeta ); + svfloat32_t z50 = svmla_m( svptrue_b32(), z10, svget2( zq6, 0 ), + zbeta ); + svfloat32_t z60 = svmla_m( svptrue_b32(), z20, svget2( zq5, 1 ), + zbeta ); + svfloat32_t z70 = svmla_m( svptrue_b32(), z30, svget2( zq6, 1 ), + zbeta ); + + // Store full result into C + svfloat32x2_t z400 = svcreate2( z40, z60 ); + svst1_f32_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 0 ) * rs_c], z400 ); + + svfloat32x2_t z600 = svcreate2( z50, z70 ); + svst1_f32_x2( svptrue_c32(), + &c_[result_tile_BL_corner + ( tcol + 0 ) * rs_c], z600 ); + + // Repeat unfolded x4 + svfloat32_t z01 = svread_hor_za32_m( z01, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 1 ); + svfloat32_t z11 = svread_hor_za32_m( z11, 
svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 1 ); + svfloat32_t z21 = svread_hor_za32_m( z21, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 1 ); + svfloat32_t z31 = svread_hor_za32_m( z31, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 1 ); + + svfloat32_t z02 = svmul_f32_z( svptrue_b32(), z01, zalpha ); + svfloat32_t z12 = svmul_f32_z( svptrue_b32(), z11, zalpha ); + svfloat32_t z22 = svmul_f32_z( svptrue_b32(), z21, zalpha ); + svfloat32_t z32 = svmul_f32_z( svptrue_b32(), z31, zalpha ); + + svfloat32x2_t zq51 = svld1_f32_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 1 ) * rs_c ) )] ); + svfloat32x2_t zq61 = svld1_f32_x2( svptrue_c32(), + &c_[result_tile_BL_corner + ( ( ( tcol + 1 ) * rs_c ) )] ); + + svfloat32_t z401 = svmla_m( svptrue_b32(), z02, + svget2( zq51, 0 ), zbeta ); + svfloat32_t z501 = svmla_m( svptrue_b32(), z12, + svget2( zq61, 0 ), zbeta ); + svfloat32_t z601 = svmla_m( svptrue_b32(), z22, + svget2( zq51, 1 ), zbeta ); + svfloat32_t z701 = svmla_m( svptrue_b32(), z32, + svget2( zq61, 1 ), zbeta ); + + svfloat32x2_t z4001 = svcreate2( z401, z601 ); + svst1_f32_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 1 ) * rs_c], z4001 ); + + svfloat32x2_t z6001 = svcreate2( z501, z701 ); + svst1_f32_x2( svptrue_c32(), + &c_[result_tile_BL_corner + ( tcol + 1 ) * rs_c], z6001 ); + + svfloat32_t z03 = svread_hor_za32_m( z03, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 2 ); + svfloat32_t z13 = svread_hor_za32_m( z13, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 2 ); + svfloat32_t z23 = svread_hor_za32_m( z23, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 2 ); + svfloat32_t z33 = svread_hor_za32_m( z33, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 2 ); + + svfloat32_t z04 = svmul_f32_z( svptrue_b32(), z03, zalpha ); + svfloat32_t z14 = svmul_f32_z( svptrue_b32(), z13, zalpha ); + svfloat32_t z24 = svmul_f32_z( svptrue_b32(), z23, zalpha ); + svfloat32_t z34 = svmul_f32_z( svptrue_b32(), 
z33, zalpha ); + + svfloat32x2_t zq52 = svld1_f32_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 2 ) * rs_c ) )] ); + svfloat32x2_t zq62 = svld1_f32_x2( svptrue_c32(), + &c_[result_tile_BL_corner + ( ( ( tcol + 2 ) * rs_c ) )] ); + + svfloat32_t z402 = svmla_m( svptrue_b32(), z04, + svget2( zq52, 0 ), zbeta ); + svfloat32_t z502 = svmla_m( svptrue_b32(), z14, + svget2( zq62, 0 ), zbeta ); + svfloat32_t z602 = svmla_m( svptrue_b32(), z24, + svget2( zq52, 1 ), zbeta ); + svfloat32_t z702 = svmla_m( svptrue_b32(), z34, + svget2( zq62, 1 ), zbeta ); + + svfloat32x2_t z4002 = svcreate2( z402, z602 ); + svst1_f32_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 2 ) * rs_c], z4002 ); + + svfloat32x2_t z6002 = svcreate2( z502, z702 ); + svst1_f32_x2( svptrue_c32(), + &c_[result_tile_BL_corner + ( tcol + 2 ) * rs_c], z6002 ); + + svfloat32_t z05 = svread_hor_za32_m( z05, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 3 ); + svfloat32_t z15 = svread_hor_za32_m( z15, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 3 ); + svfloat32_t z25 = svread_hor_za32_m( z25, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 3 ); + svfloat32_t z35 = svread_hor_za32_m( z35, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 3 ); + + svfloat32_t z06 = svmul_f32_z( svptrue_b32(), z05, zalpha ); + svfloat32_t z16 = svmul_f32_z( svptrue_b32(), z15, zalpha ); + svfloat32_t z26 = svmul_f32_z( svptrue_b32(), z25, zalpha ); + svfloat32_t z36 = svmul_f32_z( svptrue_b32(), z35, zalpha ); + + svfloat32x2_t zq53 = svld1_f32_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 3 ) * rs_c ) )] ); + svfloat32x2_t zq63 = svld1_f32_x2( svptrue_c32(), + &c_[result_tile_BL_corner + ( ( ( tcol + 3 ) * rs_c ) )] ); + + svfloat32_t z403 = svmla_m( svptrue_b32(), z06, + svget2( zq53, 0 ), zbeta ); + svfloat32_t z503 = svmla_m( svptrue_b32(), z16, + svget2( zq63, 0 ), zbeta ); + svfloat32_t z603 = svmla_m( svptrue_b32(), z26, + svget2( zq53, 1 ), zbeta ); + svfloat32_t 
z703 = svmla_m( svptrue_b32(), z36, + svget2( zq63, 1 ), zbeta ); + + svfloat32x2_t z4003 = svcreate2( z403, z603 ); + svst1_f32_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 3 ) * rs_c], z4003 ); + + svfloat32x2_t z6003 = svcreate2( z503, z703 ); + svst1_f32_x2( svptrue_c32(), + &c_[result_tile_BL_corner + ( tcol + 3 ) * rs_c], z6003 ); + } + } + } + + GEMM_UKR_FLUSH_CT( s ); + + return; +} + +__arm_new( "za" ) __arm_locally_streaming void bli_dgemm_armsme_int_4SVLx2SVL + ( + dim_t m, + dim_t n, + dim_t k, + const void* alpha, + const void* a, + const void* b, + const void* beta, + void* c, inc_t rs_c, inc_t cs_c, + const auxinfo_t* data, + const cntx_t* cntx + ) +{ + uint64_t SVL = svcntsd(); + + GEMM_UKR_SETUP_CT_AMBI( d, 4 * SVL, 2 * SVL, false ); + + double *a_ = (double *)a; + double *b_ = (double *)b; + double *c_ = (double *)c; + + svzero_za(); + + uint64_t k_; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; + + for ( k_ = 0; k_ < k_iter; k_++ ) + { + // Loads + svfloat64x4_t zL00 = svld1_f64_x4( svptrue_c32(), + (float64_t *)( &a_[0] ) ); + svfloat64x2_t zR00 = svld1_f64_x2( svptrue_c32(), + (float64_t *)( &b_[0] ) ); + + svmopa_za64_m( 0, svptrue_b32(), svptrue_b32(), svget4( zL00, 0 ), + svget2( zR00, 0 ) ); + svmopa_za64_m( 1, svptrue_b32(), svptrue_b32(), svget4( zL00, 1 ), + svget2( zR00, 0 ) ); + + svmopa_za64_m( 2, svptrue_b32(), svptrue_b32(), svget4( zL00, 2 ), + svget2( zR00, 0 ) ); + svmopa_za64_m( 3, svptrue_b32(), svptrue_b32(), svget4( zL00, 3 ), + svget2( zR00, 0 ) ); + + svmopa_za64_m( 4, svptrue_b32(), svptrue_b32(), svget4( zL00, 0 ), + svget2( zR00, 1 ) ); + svmopa_za64_m( 5, svptrue_b32(), svptrue_b32(), svget4( zL00, 1 ), + svget2( zR00, 1 ) ); + + svmopa_za64_m( 6, svptrue_b32(), svptrue_b32(), svget4( zL00, 2 ), + svget2( zR00, 1 ) ); + svmopa_za64_m( 7, svptrue_b32(), svptrue_b32(), svget4( zL00, 3 ), + svget2( zR00, 1 ) ); + + svfloat64x4_t zL01 = svld1_f64_x4( svptrue_c32(), + (float64_t *)( &a_[( 4 * SVL )] ) ); 
+ svfloat64x2_t zR01 = svld1_f64_x2( svptrue_c32(), + (float64_t *)( &b_[( 2 * SVL )] ) ); + + svmopa_za64_m( 0, svptrue_b32(), svptrue_b32(), svget4( zL01, 0 ), + svget2( zR01, 0 ) ); + svmopa_za64_m( 1, svptrue_b32(), svptrue_b32(), svget4( zL01, 1 ), + svget2( zR01, 0 ) ); + + svmopa_za64_m( 2, svptrue_b32(), svptrue_b32(), svget4( zL01, 2 ), + svget2( zR01, 0 ) ); + svmopa_za64_m( 3, svptrue_b32(), svptrue_b32(), svget4( zL01, 3 ), + svget2( zR01, 0 ) ); + + svmopa_za64_m( 4, svptrue_b32(), svptrue_b32(), svget4( zL01, 0 ), + svget2( zR01, 1 ) ); + svmopa_za64_m( 5, svptrue_b32(), svptrue_b32(), svget4( zL01, 1 ), + svget2( zR01, 1 ) ); + + svmopa_za64_m( 6, svptrue_b32(), svptrue_b32(), svget4( zL01, 2 ), + svget2( zR01, 1 ) ); + svmopa_za64_m( 7, svptrue_b32(), svptrue_b32(), svget4( zL01, 3 ), + svget2( zR01, 1 ) ); + + svfloat64x4_t zL02 = svld1_f64_x4( svptrue_c32(), + (float64_t *)( &a_[( 8 * SVL )] ) ); + svfloat64x2_t zR02 = svld1_f64_x2( svptrue_c32(), + (float64_t *)( &b_[( 4 * SVL )] ) ); + + svmopa_za64_m( 0, svptrue_b32(), svptrue_b32(), svget4( zL02, 0 ), + svget2( zR02, 0 ) ); + svmopa_za64_m( 1, svptrue_b32(), svptrue_b32(), svget4( zL02, 1 ), + svget2( zR02, 0 ) ); + + svmopa_za64_m( 2, svptrue_b32(), svptrue_b32(), svget4( zL02, 2 ), + svget2( zR02, 0 ) ); + svmopa_za64_m( 3, svptrue_b32(), svptrue_b32(), svget4( zL02, 3 ), + svget2( zR02, 0 ) ); + + svmopa_za64_m( 4, svptrue_b32(), svptrue_b32(), svget4( zL02, 0 ), + svget2( zR02, 1 ) ); + svmopa_za64_m( 5, svptrue_b32(), svptrue_b32(), svget4( zL02, 1 ), + svget2( zR02, 1 ) ); + + svmopa_za64_m( 6, svptrue_b32(), svptrue_b32(), svget4( zL02, 2 ), + svget2( zR02, 1 ) ); + svmopa_za64_m( 7, svptrue_b32(), svptrue_b32(), svget4( zL02, 3 ), + svget2( zR02, 1 ) ); + + svfloat64x4_t zL03 = svld1_f64_x4( svptrue_c32(), + (float64_t *)( &a_[( 12 * SVL )] ) ); + svfloat64x2_t zR03 = svld1_f64_x2( svptrue_c32(), + (float64_t *)( &b_[( 6 * SVL )] ) ); + + svmopa_za64_m( 0, svptrue_b32(), svptrue_b32(), 
svget4( zL03, 0 ), + svget2( zR03, 0 ) ); + svmopa_za64_m( 1, svptrue_b32(), svptrue_b32(), svget4( zL03, 1 ), + svget2( zR03, 0 ) ); + + svmopa_za64_m( 2, svptrue_b32(), svptrue_b32(), svget4( zL03, 2 ), + svget2( zR03, 0 ) ); + svmopa_za64_m( 3, svptrue_b32(), svptrue_b32(), svget4( zL03, 3 ), + svget2( zR03, 0 ) ); + + svmopa_za64_m( 4, svptrue_b32(), svptrue_b32(), svget4( zL03, 0 ), + svget2( zR03, 1 ) ); + svmopa_za64_m( 5, svptrue_b32(), svptrue_b32(), svget4( zL03, 1 ), + svget2( zR03, 1 ) ); + + svmopa_za64_m( 6, svptrue_b32(), svptrue_b32(), svget4( zL03, 2 ), + svget2( zR03, 1 ) ); + svmopa_za64_m( 7, svptrue_b32(), svptrue_b32(), svget4( zL03, 3 ), + svget2( zR03, 1 ) ); + + a_ += ( 2 * 8 * SVL ); + b_ += ( 8 * SVL ); + } + + for ( k_ = 0; k_ < k_left; k_ += 1 ) + { + svfloat64x4_t zL00 = svld1_f64_x4( svptrue_c32(), + (float64_t *)( &a_[0] ) ); + svfloat64x2_t zR00 = svld1_f64_x2( svptrue_c32(), + (float64_t *)( &b_[0] ) ); + + svmopa_za64_m( 0, svptrue_b32(), svptrue_b32(), svget4( zL00, 0 ), + svget2( zR00, 0 ) ); + svmopa_za64_m( 1, svptrue_b32(), svptrue_b32(), svget4( zL00, 1 ), + svget2( zR00, 0 ) ); + + svmopa_za64_m( 2, svptrue_b32(), svptrue_b32(), svget4( zL00, 2 ), + svget2( zR00, 0 ) ); + svmopa_za64_m( 3, svptrue_b32(), svptrue_b32(), svget4( zL00, 3 ), + svget2( zR00, 0 ) ); + + svmopa_za64_m( 4, svptrue_b32(), svptrue_b32(), svget4( zL00, 0 ), + svget2( zR00, 1 ) ); + svmopa_za64_m( 5, svptrue_b32(), svptrue_b32(), svget4( zL00, 1 ), + svget2( zR00, 1 ) ); + + svmopa_za64_m( 6, svptrue_b32(), svptrue_b32(), svget4( zL00, 2 ), + svget2( zR00, 1 ) ); + svmopa_za64_m( 7, svptrue_b32(), svptrue_b32(), svget4( zL00, 3 ), + svget2( zR00, 1 ) ); + + a_ += ( 4 * SVL ); + b_ += ( 2 * SVL ); + } + + double beta_ = *(double *)beta; + double alpha_ = *(double *)alpha; + + const uint64_t result_tile_TL_corner = 0; + + svfloat64_t zbeta = svdup_f64( beta_ ); + svfloat64_t zalpha = svdup_f64( alpha_ ); + + if ( rs_c == 1 ) + { + const uint64_t 
result_tile_TR_corner = SVL * cs_c; + + if ( beta_ == 0 ) + { + for ( uint64_t tcol = 0; tcol < SVL; tcol += 4 ) + { + // Read ZA slices into Z regs + svfloat64_t z0 = svread_ver_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 0 ); + svfloat64_t z1 = svread_ver_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 0 ); + svfloat64_t z2 = svread_ver_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 0 ); + svfloat64_t z3 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 0 ); + svfloat64_t z4 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 0 ); + svfloat64_t z5 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 0 ); + svfloat64_t z6 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 0 ); + svfloat64_t z7 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 0 ); + + // Scale Z regs by broadcast alpha + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + // Store full result into C + svfloat64x4_t z400 = svcreate4( z0, z1, z2, z3 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 0 ) * cs_c], z400 ); + + svfloat64x4_t z600 = svcreate4( z4, z5, z6, z7 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 0 ) * cs_c], z600 ); + + // tcol + 1 + z0 = svread_ver_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 1 ); + z1 = svread_ver_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 1 ); + z2 = svread_ver_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 1 ); + z3 = svread_ver_za64_m( z3, 
svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 1 ); + z4 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 1 ); + z5 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 1 ); + z6 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 1 ); + z7 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 1 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + z400 = svcreate4( z0, z1, z2, z3 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 1 ) * cs_c], z400 ); + + z600 = svcreate4( z4, z5, z6, z7 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 1 ) * cs_c], z600 ); + + // tcol + 2 + z0 = svread_ver_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 2 ); + z1 = svread_ver_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 2 ); + z2 = svread_ver_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 2 ); + z3 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 2 ); + z4 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 2 ); + z5 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 2 ); + z6 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 2 ); + z7 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 2 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( 
svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + z400 = svcreate4( z0, z1, z2, z3 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 2 ) * cs_c], z400 ); + + z600 = svcreate4( z4, z5, z6, z7 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 2 ) * cs_c], z600 ); + + // tcol + 3 + z0 = svread_ver_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 3 ); + z1 = svread_ver_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 3 ); + z2 = svread_ver_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 3 ); + z3 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 3 ); + z4 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 3 ); + z5 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 3 ); + z6 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 3 ); + z7 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 3 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + z400 = svcreate4( z0, z1, z2, z3 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 3 ) * cs_c], z400 ); + + z600 = svcreate4( z4, z5, z6, z7 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 3 ) * cs_c], z600 ); + } + } + else + { + for ( uint64_t tcol = 0; tcol < SVL; tcol += 4 ) + { + // Read ZA slices into Z regs + svfloat64_t z0 = svread_ver_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ 
tcol + 0 ); + svfloat64_t z1 = svread_ver_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 0 ); + svfloat64_t z2 = svread_ver_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 0 ); + svfloat64_t z3 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 0 ); + svfloat64_t z4 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 0 ); + svfloat64_t z5 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 0 ); + svfloat64_t z6 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 0 ); + svfloat64_t z7 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 0 ); + + // Scale Z regs by broadcast alpha + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + // Load C into Z regs + svfloat64x4_t zq5 = svld1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 0 ) * cs_c ) )] ); + svfloat64x4_t zq6 = svld1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( ( ( tcol + 0 ) * cs_c ) )] ); + + // Scale Z regs by broadcast beta + svfloat64_t z40 = svmla_m( svptrue_b32(), z0, svget4( zq5, 0 ), + zbeta ); + svfloat64_t z50 = svmla_m( svptrue_b32(), z1, svget4( zq5, 1 ), + zbeta ); + svfloat64_t z60 = svmla_m( svptrue_b32(), z2, svget4( zq5, 2 ), + zbeta ); + svfloat64_t z70 = svmla_m( svptrue_b32(), z3, svget4( zq5, 3 ), + zbeta ); + svfloat64_t z80 = svmla_m( svptrue_b32(), z4, svget4( zq6, 0 ), + zbeta ); + svfloat64_t z90 = svmla_m( svptrue_b32(), z5, svget4( zq6, 1 ), + zbeta ); + svfloat64_t za0 = svmla_m( svptrue_b32(), z6, svget4( zq6, 2 ), + zbeta ); + svfloat64_t zb0 = svmla_m( 
svptrue_b32(), z7, svget4( zq6, 3 ), + zbeta ); + + // Store full result into C + svfloat64x4_t z400 = svcreate4( z40, z50, z60, z70 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 0 ) * cs_c], z400 ); + + svfloat64x4_t z600 = svcreate4( z80, z90, za0, zb0 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 0 ) * cs_c], z600 ); + + // tcol + 1 + z0 = svread_ver_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 1 ); + z1 = svread_ver_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 1 ); + z2 = svread_ver_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 1 ); + z3 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 1 ); + z4 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 1 ); + z5 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 1 ); + z6 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 1 ); + z7 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 1 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + zq5 = svld1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 1 ) * cs_c ) )] ); + zq6 = svld1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( ( ( tcol + 1 ) * cs_c ) )] ); + + z40 = svmla_m( svptrue_b32(), z0, svget4( zq5, 0 ), zbeta ); + z50 = svmla_m( svptrue_b32(), z1, svget4( zq5, 1 ), zbeta ); + z60 = svmla_m( svptrue_b32(), z2, svget4( zq5, 2 ), zbeta ); + z70 = svmla_m( svptrue_b32(), z3, svget4( zq5, 3 ), zbeta ); + z80 = svmla_m( svptrue_b32(), z4, svget4( zq6, 0 ), zbeta ); 
+ z90 = svmla_m( svptrue_b32(), z5, svget4( zq6, 1 ), zbeta ); + za0 = svmla_m( svptrue_b32(), z6, svget4( zq6, 2 ), zbeta ); + zb0 = svmla_m( svptrue_b32(), z7, svget4( zq6, 3 ), zbeta ); + + z400 = svcreate4( z40, z50, z60, z70 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 1 ) * cs_c], z400 ); + + z600 = svcreate4( z80, z90, za0, zb0 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 1 ) * cs_c], z600 ); + + // tcol + 2 + z0 = svread_ver_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 2 ); + z1 = svread_ver_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 2 ); + z2 = svread_ver_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 2 ); + z3 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 2 ); + z4 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 2 ); + z5 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 2 ); + z6 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 2 ); + z7 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 2 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + zq5 = svld1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 2 ) * cs_c ) )] ); + zq6 = svld1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( ( ( tcol + 2 ) * cs_c ) )] ); + + z40 = svmla_m( svptrue_b32(), z0, svget4( zq5, 0 ), zbeta ); + z50 = svmla_m( svptrue_b32(), z1, svget4( zq5, 1 ), zbeta ); + z60 = svmla_m( svptrue_b32(), z2, svget4( zq5, 2 ), zbeta ); + z70 = svmla_m( svptrue_b32(), z3, svget4( 
zq5, 3 ), zbeta ); + z80 = svmla_m( svptrue_b32(), z4, svget4( zq6, 0 ), zbeta ); + z90 = svmla_m( svptrue_b32(), z5, svget4( zq6, 1 ), zbeta ); + za0 = svmla_m( svptrue_b32(), z6, svget4( zq6, 2 ), zbeta ); + zb0 = svmla_m( svptrue_b32(), z7, svget4( zq6, 3 ), zbeta ); + + z400 = svcreate4( z40, z50, z60, z70 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 2 ) * cs_c], z400 ); + + z600 = svcreate4( z80, z90, za0, zb0 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 2 ) * cs_c], z600 ); + + // tcol + 3 + z0 = svread_ver_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 3 ); + z1 = svread_ver_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 3 ); + z2 = svread_ver_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 3 ); + z3 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 3 ); + z4 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 3 ); + z5 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 3 ); + z6 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 3 ); + z7 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 3 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + zq5 = svld1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 3 ) * cs_c ) )] ); + zq6 = svld1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( ( ( tcol + 3 ) * cs_c ) )] ); + + z40 = svmla_m( svptrue_b32(), z0, svget4( zq5, 0 ), zbeta ); + z50 = svmla_m( svptrue_b32(), z1, svget4( zq5, 1 ), zbeta ); + z60 = svmla_m( 
svptrue_b32(), z2, svget4( zq5, 2 ), zbeta ); + z70 = svmla_m( svptrue_b32(), z3, svget4( zq5, 3 ), zbeta ); + z80 = svmla_m( svptrue_b32(), z4, svget4( zq6, 0 ), zbeta ); + z90 = svmla_m( svptrue_b32(), z5, svget4( zq6, 1 ), zbeta ); + za0 = svmla_m( svptrue_b32(), z6, svget4( zq6, 2 ), zbeta ); + zb0 = svmla_m( svptrue_b32(), z7, svget4( zq6, 3 ), zbeta ); + + z400 = svcreate4( z40, z50, z60, z70 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 3 ) * cs_c], z400 ); + + z600 = svcreate4( z80, z90, za0, zb0 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 3 ) * cs_c], z600 ); + } + } + } + else + { + const uint64_t result_tile_TR_corner = SVL * rs_c; + const uint64_t result_tile_BL_corner = SVL * 2 * rs_c; + const uint64_t result_tile_BR_corner = SVL * 3 * rs_c; + + if ( beta_ == 0 ) + { + for ( uint64_t tcol = 0; tcol < SVL; tcol += 4 ) + { + // Read ZA slices into Z regs + svfloat64_t z0 = svread_hor_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 0 ); + svfloat64_t z1 = svread_hor_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 0 ); + svfloat64_t z2 = svread_hor_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 0 ); + svfloat64_t z3 = svread_hor_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 0 ); + svfloat64_t z4 = svread_hor_za64_m( z4, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 0 ); + svfloat64_t z5 = svread_hor_za64_m( z5, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 0 ); + svfloat64_t z6 = svread_hor_za64_m( z6, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 0 ); + svfloat64_t z7 = svread_hor_za64_m( z7, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 0 ); + + // Scale Z regs by broadcast alpha + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, 
zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + // Store full result into C + svfloat64x2_t z400 = svcreate2( z0, z4 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 0 ) * rs_c], z400 ); + + svfloat64x2_t z600 = svcreate2( z1, z5 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 0 ) * rs_c], z600 ); + svfloat64x2_t z700 = svcreate2( z2, z6 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_BL_corner + ( tcol + 0 ) * rs_c], z700 ); + + svfloat64x2_t z800 = svcreate2( z3, z7 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_BR_corner + ( tcol + 0 ) * rs_c], z800 ); + + // tcol + 1 + z0 = svread_hor_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 1 ); + z1 = svread_hor_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 1 ); + z2 = svread_hor_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 1 ); + z3 = svread_hor_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 1 ); + z4 = svread_hor_za64_m( z4, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 1 ); + z5 = svread_hor_za64_m( z5, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 1 ); + z6 = svread_hor_za64_m( z6, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 1 ); + z7 = svread_hor_za64_m( z7, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 1 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + z400 = svcreate2( z0, z4 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 1 ) * rs_c], z400 ); + + z600 = svcreate2( z1, z5 ); + 
svst1_f64_x2( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 1 ) * rs_c], z600 ); + + z700 = svcreate2( z2, z6 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_BL_corner + ( tcol + 1 ) * rs_c], z700 ); + + z800 = svcreate2( z3, z7 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_BR_corner + ( tcol + 1 ) * rs_c], z800 ); + + // tcol + 2 + z0 = svread_hor_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 2 ); + z1 = svread_hor_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 2 ); + z2 = svread_hor_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 2 ); + z3 = svread_hor_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 2 ); + z4 = svread_hor_za64_m( z4, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 2 ); + z5 = svread_hor_za64_m( z5, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 2 ); + z6 = svread_hor_za64_m( z6, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 2 ); + z7 = svread_hor_za64_m( z7, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 2 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + z400 = svcreate2( z0, z4 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 2 ) * rs_c], z400 ); + + z600 = svcreate2( z1, z5 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 2 ) * rs_c], z600 ); + + z700 = svcreate2( z2, z6 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_BL_corner + ( tcol + 2 ) * rs_c], z700 ); + + z800 = svcreate2( z3, z7 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_BR_corner + ( tcol + 2 ) * rs_c], z800 ); + + // tcol + 3 + z0 = svread_hor_za64_m( z0, svptrue_b32(), + /* 
tile: */ 0, /* slice: */ tcol + 3 ); + z1 = svread_hor_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 3 ); + z2 = svread_hor_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 3 ); + z3 = svread_hor_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 3 ); + z4 = svread_hor_za64_m( z4, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 3 ); + z5 = svread_hor_za64_m( z5, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 3 ); + z6 = svread_hor_za64_m( z6, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 3 ); + z7 = svread_hor_za64_m( z7, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 3 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + z400 = svcreate2( z0, z4 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 3 ) * rs_c], z400 ); + + z600 = svcreate2( z1, z5 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 3 ) * rs_c], z600 ); + + z700 = svcreate2( z2, z6 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_BL_corner + ( tcol + 3 ) * rs_c], z700 ); + + z800 = svcreate2( z3, z7 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_BR_corner + ( tcol + 3 ) * rs_c], z800 ); + } + } + else + { + for ( uint64_t tcol = 0; tcol < SVL; tcol += 4 ) + { + // Read ZA slices into Z regs + svfloat64_t z0 = svread_hor_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 0 ); + svfloat64_t z1 = svread_hor_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 0 ); + svfloat64_t z2 = svread_hor_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 0 ); + svfloat64_t z3 = svread_hor_za64_m( z3, svptrue_b32(), + /* 
tile: */ 3, /* slice: */ tcol + 0 ); + svfloat64_t z4 = svread_hor_za64_m( z4, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 0 ); + svfloat64_t z5 = svread_hor_za64_m( z5, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 0 ); + svfloat64_t z6 = svread_hor_za64_m( z6, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 0 ); + svfloat64_t z7 = svread_hor_za64_m( z7, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 0 ); + + // Scale Z regs by broadcast alpha + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + // Load C into Z regs + svfloat64x2_t zq5 = svld1_f64_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 0 ) * rs_c ) )] ); + svfloat64x2_t zq6 = svld1_f64_x2( svptrue_c32(), + &c_[result_tile_TR_corner + ( ( ( tcol + 0 ) * rs_c ) )] ); + svfloat64x2_t zq7 = svld1_f64_x2( svptrue_c32(), + &c_[result_tile_BL_corner + ( ( ( tcol + 0 ) * rs_c ) )] ); + svfloat64x2_t zq8 = svld1_f64_x2( svptrue_c32(), + &c_[result_tile_BR_corner + ( ( ( tcol + 0 ) * rs_c ) )] ); + + // Scale Z regs by broadcast beta (reordered ZA tiles to match + // horizontal order) + svfloat64_t z40 = svmla_m( svptrue_b32(), z0, svget2( zq5, 0 ), + zbeta ); + svfloat64_t z50 = svmla_m( svptrue_b32(), z4, svget2( zq5, 1 ), + zbeta ); + svfloat64_t z60 = svmla_m( svptrue_b32(), z1, svget2( zq6, 0 ), + zbeta ); + svfloat64_t z70 = svmla_m( svptrue_b32(), z5, svget2( zq6, 1 ), + zbeta ); + svfloat64_t z80 = svmla_m( svptrue_b32(), z2, svget2( zq7, 0 ), + zbeta ); + svfloat64_t z90 = svmla_m( svptrue_b32(), z6, svget2( zq7, 1 ), + zbeta ); + svfloat64_t za0 = svmla_m( svptrue_b32(), z3, svget2( zq8, 0 ), + zbeta ); + svfloat64_t zb0 = svmla_m( 
svptrue_b32(), z7, svget2( zq8, 1 ), + zbeta ); + + // Store full result into C + svfloat64x2_t z400 = svcreate2( z40, z50 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 0 ) * rs_c], z400 ); + + svfloat64x2_t z600 = svcreate2( z60, z70 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 0 ) * rs_c], z600 ); + + svfloat64x2_t z700 = svcreate2( z80, z90 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_BL_corner + ( tcol + 0 ) * rs_c], z700 ); + + svfloat64x2_t z800 = svcreate2( za0, zb0 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_BR_corner + ( tcol + 0 ) * rs_c], z800 ); + + // tcol + 1 + z0 = svread_hor_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 1 ); + z1 = svread_hor_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 1 ); + z2 = svread_hor_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 1 ); + z3 = svread_hor_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 1 ); + z4 = svread_hor_za64_m( z4, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 1 ); + z5 = svread_hor_za64_m( z5, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 1 ); + z6 = svread_hor_za64_m( z6, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 1 ); + z7 = svread_hor_za64_m( z7, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 1 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + zq5 = svld1_f64_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 1 ) * rs_c ) )] ); + zq6 = svld1_f64_x2( svptrue_c32(), + &c_[result_tile_TR_corner + ( ( ( tcol + 1 ) * rs_c ) )] ); + zq7 = svld1_f64_x2( svptrue_c32(), + 
&c_[result_tile_BL_corner + ( ( ( tcol + 1 ) * rs_c ) )] ); + zq8 = svld1_f64_x2( svptrue_c32(), + &c_[result_tile_BR_corner + ( ( ( tcol + 1 ) * rs_c ) )] ); + + z40 = svmla_m( svptrue_b32(), z0, svget2( zq5, 0 ), zbeta ); + z50 = svmla_m( svptrue_b32(), z4, svget2( zq5, 1 ), zbeta ); + z60 = svmla_m( svptrue_b32(), z1, svget2( zq6, 0 ), zbeta ); + z70 = svmla_m( svptrue_b32(), z5, svget2( zq6, 1 ), zbeta ); + z80 = svmla_m( svptrue_b32(), z2, svget2( zq7, 0 ), zbeta ); + z90 = svmla_m( svptrue_b32(), z6, svget2( zq7, 1 ), zbeta ); + za0 = svmla_m( svptrue_b32(), z3, svget2( zq8, 0 ), zbeta ); + zb0 = svmla_m( svptrue_b32(), z7, svget2( zq8, 1 ), zbeta ); + + z400 = svcreate2( z40, z50 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 1 ) * rs_c], z400 ); + + z600 = svcreate2( z60, z70 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 1 ) * rs_c], z600 ); + + z700 = svcreate2( z80, z90 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_BL_corner + ( tcol + 1 ) * rs_c], z700 ); + + z800 = svcreate2( za0, zb0 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_BR_corner + ( tcol + 1 ) * rs_c], z800 ); + + // tcol + 2 + z0 = svread_hor_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 2 ); + z1 = svread_hor_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 2 ); + z2 = svread_hor_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 2 ); + z3 = svread_hor_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 2 ); + z4 = svread_hor_za64_m( z4, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 2 ); + z5 = svread_hor_za64_m( z5, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 2 ); + z6 = svread_hor_za64_m( z6, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 2 ); + z7 = svread_hor_za64_m( z7, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 2 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( 
svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + zq5 = svld1_f64_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 2 ) * rs_c ) )] ); + zq6 = svld1_f64_x2( svptrue_c32(), + &c_[result_tile_TR_corner + ( ( ( tcol + 2 ) * rs_c ) )] ); + zq7 = svld1_f64_x2( svptrue_c32(), + &c_[result_tile_BL_corner + ( ( ( tcol + 2 ) * rs_c ) )] ); + zq8 = svld1_f64_x2( svptrue_c32(), + &c_[result_tile_BR_corner + ( ( ( tcol + 2 ) * rs_c ) )] ); + + z40 = svmla_m( svptrue_b32(), z0, svget2( zq5, 0 ), zbeta ); + z50 = svmla_m( svptrue_b32(), z4, svget2( zq5, 1 ), zbeta ); + z60 = svmla_m( svptrue_b32(), z1, svget2( zq6, 0 ), zbeta ); + z70 = svmla_m( svptrue_b32(), z5, svget2( zq6, 1 ), zbeta ); + z80 = svmla_m( svptrue_b32(), z2, svget2( zq7, 0 ), zbeta ); + z90 = svmla_m( svptrue_b32(), z6, svget2( zq7, 1 ), zbeta ); + za0 = svmla_m( svptrue_b32(), z3, svget2( zq8, 0 ), zbeta ); + zb0 = svmla_m( svptrue_b32(), z7, svget2( zq8, 1 ), zbeta ); + + z400 = svcreate2( z40, z50 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 2 ) * rs_c], z400 ); + + z600 = svcreate2( z60, z70 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 2 ) * rs_c], z600 ); + + z700 = svcreate2( z80, z90 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_BL_corner + ( tcol + 2 ) * rs_c], z700 ); + + z800 = svcreate2( za0, zb0 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_BR_corner + ( tcol + 2 ) * rs_c], z800 ); + + // tcol + 3 + z0 = svread_hor_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 3 ); + z1 = svread_hor_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 3 ); + z2 = svread_hor_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 3 ); + z3 = svread_hor_za64_m( z3, svptrue_b32(), 
+ /* tile: */ 3, /* slice: */ tcol + 3 ); + z4 = svread_hor_za64_m( z4, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 3 ); + z5 = svread_hor_za64_m( z5, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 3 ); + z6 = svread_hor_za64_m( z6, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 3 ); + z7 = svread_hor_za64_m( z7, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 3 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + zq5 = svld1_f64_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 3 ) * rs_c ) )] ); + zq6 = svld1_f64_x2( svptrue_c32(), + &c_[result_tile_TR_corner + ( ( ( tcol + 3 ) * rs_c ) )] ); + zq7 = svld1_f64_x2( svptrue_c32(), + &c_[result_tile_BL_corner + ( ( ( tcol + 3 ) * rs_c ) )] ); + zq8 = svld1_f64_x2( svptrue_c32(), + &c_[result_tile_BR_corner + ( ( ( tcol + 3 ) * rs_c ) )] ); + + z40 = svmla_m( svptrue_b32(), z0, svget2( zq5, 0 ), zbeta ); + z50 = svmla_m( svptrue_b32(), z4, svget2( zq5, 1 ), zbeta ); + z60 = svmla_m( svptrue_b32(), z1, svget2( zq6, 0 ), zbeta ); + z70 = svmla_m( svptrue_b32(), z5, svget2( zq6, 1 ), zbeta ); + z80 = svmla_m( svptrue_b32(), z2, svget2( zq7, 0 ), zbeta ); + z90 = svmla_m( svptrue_b32(), z6, svget2( zq7, 1 ), zbeta ); + za0 = svmla_m( svptrue_b32(), z3, svget2( zq8, 0 ), zbeta ); + zb0 = svmla_m( svptrue_b32(), z7, svget2( zq8, 1 ), zbeta ); + + z400 = svcreate2( z40, z50 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 3 ) * rs_c], z400 ); + + z600 = svcreate2( z60, z70 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 3 ) * rs_c], z600 ); + + z700 = svcreate2( z80, z90 ); + svst1_f64_x2( 
svptrue_c32(), + &c_[result_tile_BL_corner + ( tcol + 3 ) * rs_c], z700 ); + + z800 = svcreate2( za0, zb0 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_BR_corner + ( tcol + 3 ) * rs_c], z800 ); + } + } + } + + GEMM_UKR_FLUSH_CT( d ); + + return; +} + + + diff --git a/kernels/armsme/3/bli_gemm_armsme_int_d2SVLx4SVL.c b/kernels/armsme/3/bli_gemm_armsme_int_d2SVLx4SVL.c new file mode 100644 index 000000000..dc9cdc2f8 --- /dev/null +++ b/kernels/armsme/3/bli_gemm_armsme_int_d2SVLx4SVL.c @@ -0,0 +1,1030 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2021, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are +met: +- Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name(s) of the copyright holder(s) nor the names of its +contributors may be used to endorse or promote products derived +from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + */ + +#include +#include + +#include "blis.h" + +__arm_new( "za" ) __arm_locally_streaming void bli_dgemm_armsme_int_2SVLx4SVL + ( + dim_t m, + dim_t n, + dim_t k, + const void* alpha, + const void* a, + const void* b, + const void* beta, + void* c, inc_t rs_c, inc_t cs_c, + const auxinfo_t* data, + const cntx_t* cntx + ) +{ + uint64_t SVL = svcntsd(); + + GEMM_UKR_SETUP_CT_AMBI( d, 2 * SVL, 4 * SVL, false ); + + double* a_ = (double*)a; + double* b_ = (double*)b; + double* c_ = (double*)c; + + svzero_za(); + + uint64_t k_; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; + + for ( k_ = 0; k_ < k_iter; k_++ ) + { + // Loads + svfloat64x4_t zL00 = svld1_f64_x4( svptrue_c32(), + (float64_t*)( &b_[0] ) ); + svfloat64x2_t zR00 = svld1_f64_x2( svptrue_c32(), + (float64_t*)( &a_[0] ) ); + + svmopa_za64_m( 0, svptrue_b32(), svptrue_b32(), svget4( zL00, 0 ), + svget2( zR00, 0 ) ); + svmopa_za64_m( 1, svptrue_b32(), svptrue_b32(), svget4( zL00, 1 ), + svget2( zR00, 0 ) ); + + svmopa_za64_m( 2, svptrue_b32(), svptrue_b32(), svget4( zL00, 2 ), + svget2( zR00, 0 ) ); + svmopa_za64_m( 3, svptrue_b32(), svptrue_b32(), svget4( zL00, 3 ), + svget2( zR00, 0 ) ); + + svmopa_za64_m( 4, svptrue_b32(), svptrue_b32(), svget4( zL00, 0 ), + svget2( zR00, 1 ) ); + svmopa_za64_m( 5, svptrue_b32(), svptrue_b32(), svget4( zL00, 1 ), + svget2( zR00, 1 ) ); + + svmopa_za64_m( 6, svptrue_b32(), svptrue_b32(), svget4( zL00, 2 ), + svget2( zR00, 1 ) ); + svmopa_za64_m( 7, 
svptrue_b32(), svptrue_b32(), svget4( zL00, 3 ), + svget2( zR00, 1 ) ); + + svfloat64x4_t zL01 = svld1_f64_x4( svptrue_c32(), + (float64_t*)( &b_[( 4 * SVL )] ) ); + svfloat64x2_t zR01 = svld1_f64_x2( svptrue_c32(), + (float64_t*)( &a_[( 2 * SVL )] ) ); + + svmopa_za64_m( 0, svptrue_b32(), svptrue_b32(), svget4( zL01, 0 ), + svget2( zR01, 0 ) ); + svmopa_za64_m( 1, svptrue_b32(), svptrue_b32(), svget4( zL01, 1 ), + svget2( zR01, 0 ) ); + + svmopa_za64_m( 2, svptrue_b32(), svptrue_b32(), svget4( zL01, 2 ), + svget2( zR01, 0 ) ); + svmopa_za64_m( 3, svptrue_b32(), svptrue_b32(), svget4( zL01, 3 ), + svget2( zR01, 0 ) ); + + svmopa_za64_m( 4, svptrue_b32(), svptrue_b32(), svget4( zL01, 0 ), + svget2( zR01, 1 ) ); + svmopa_za64_m( 5, svptrue_b32(), svptrue_b32(), svget4( zL01, 1 ), + svget2( zR01, 1 ) ); + + svmopa_za64_m( 6, svptrue_b32(), svptrue_b32(), svget4( zL01, 2 ), + svget2( zR01, 1 ) ); + svmopa_za64_m( 7, svptrue_b32(), svptrue_b32(), svget4( zL01, 3 ), + svget2( zR01, 1 ) ); + + svfloat64x4_t zL02 = svld1_f64_x4( svptrue_c32(), + (float64_t*)( &b_[( 8 * SVL )] ) ); + svfloat64x2_t zR02 = svld1_f64_x2( svptrue_c32(), + (float64_t*)( &a_[( 4 * SVL )] ) ); + + svmopa_za64_m( 0, svptrue_b32(), svptrue_b32(), svget4( zL02, 0 ), + svget2( zR02, 0 ) ); + svmopa_za64_m( 1, svptrue_b32(), svptrue_b32(), svget4( zL02, 1 ), + svget2( zR02, 0 ) ); + + svmopa_za64_m( 2, svptrue_b32(), svptrue_b32(), svget4( zL02, 2 ), + svget2( zR02, 0 ) ); + svmopa_za64_m( 3, svptrue_b32(), svptrue_b32(), svget4( zL02, 3 ), + svget2( zR02, 0 ) ); + + svmopa_za64_m( 4, svptrue_b32(), svptrue_b32(), svget4( zL02, 0 ), + svget2( zR02, 1 ) ); + svmopa_za64_m( 5, svptrue_b32(), svptrue_b32(), svget4( zL02, 1 ), + svget2( zR02, 1 ) ); + + svmopa_za64_m( 6, svptrue_b32(), svptrue_b32(), svget4( zL02, 2 ), + svget2( zR02, 1 ) ); + svmopa_za64_m( 7, svptrue_b32(), svptrue_b32(), svget4( zL02, 3 ), + svget2( zR02, 1 ) ); + + svfloat64x4_t zL03 = svld1_f64_x4( svptrue_c32(), + (float64_t*)( &b_[( 
12 * SVL )] ) ); + svfloat64x2_t zR03 = svld1_f64_x2( svptrue_c32(), + (float64_t*)( &a_[( 6 * SVL )] ) ); + + svmopa_za64_m( 0, svptrue_b32(), svptrue_b32(), svget4( zL03, 0 ), + svget2( zR03, 0 ) ); + svmopa_za64_m( 1, svptrue_b32(), svptrue_b32(), svget4( zL03, 1 ), + svget2( zR03, 0 ) ); + + svmopa_za64_m( 2, svptrue_b32(), svptrue_b32(), svget4( zL03, 2 ), + svget2( zR03, 0 ) ); + svmopa_za64_m( 3, svptrue_b32(), svptrue_b32(), svget4( zL03, 3 ), + svget2( zR03, 0 ) ); + + svmopa_za64_m( 4, svptrue_b32(), svptrue_b32(), svget4( zL03, 0 ), + svget2( zR03, 1 ) ); + svmopa_za64_m( 5, svptrue_b32(), svptrue_b32(), svget4( zL03, 1 ), + svget2( zR03, 1 ) ); + + svmopa_za64_m( 6, svptrue_b32(), svptrue_b32(), svget4( zL03, 2 ), + svget2( zR03, 1 ) ); + svmopa_za64_m( 7, svptrue_b32(), svptrue_b32(), svget4( zL03, 3 ), + svget2( zR03, 1 ) ); + + a_ += ( 8 * SVL ); + b_ += ( 2 * 8 * SVL ); + } + + for ( k_ = 0; k_ < k_left; k_ += 1 ) + { + svfloat64x4_t zL00 = svld1_f64_x4( svptrue_c32(), + (float64_t*)( &b_[0] ) ); + svfloat64x2_t zR00 = svld1_f64_x2( svptrue_c32(), + (float64_t*)( &a_[0] ) ); + + svmopa_za64_m( 0, svptrue_b32(), svptrue_b32(), svget4( zL00, 0 ), + svget2( zR00, 0 ) ); + svmopa_za64_m( 1, svptrue_b32(), svptrue_b32(), svget4( zL00, 1 ), + svget2( zR00, 0 ) ); + + svmopa_za64_m( 2, svptrue_b32(), svptrue_b32(), svget4( zL00, 2 ), + svget2( zR00, 0 ) ); + svmopa_za64_m( 3, svptrue_b32(), svptrue_b32(), svget4( zL00, 3 ), + svget2( zR00, 0 ) ); + + svmopa_za64_m( 4, svptrue_b32(), svptrue_b32(), svget4( zL00, 0 ), + svget2( zR00, 1 ) ); + svmopa_za64_m( 5, svptrue_b32(), svptrue_b32(), svget4( zL00, 1 ), + svget2( zR00, 1 ) ); + + svmopa_za64_m( 6, svptrue_b32(), svptrue_b32(), svget4( zL00, 2 ), + svget2( zR00, 1 ) ); + svmopa_za64_m( 7, svptrue_b32(), svptrue_b32(), svget4( zL00, 3 ), + svget2( zR00, 1 ) ); + + a_ += ( 2 * SVL ); + b_ += ( 4 * SVL ); + } + + double beta_ = *(double*)beta; + double alpha_ = *(double*)alpha; + + const uint64_t 
result_tile_TL_corner = 0; + + svfloat64_t zbeta = svdup_f64( beta_ ); + svfloat64_t zalpha = svdup_f64( alpha_ ); + + if ( cs_c == 1 ) + { + const uint64_t result_tile_TR_corner = SVL * rs_c; + + if ( beta_ == 0 ) + { + for ( uint64_t tcol = 0; tcol < SVL; tcol += 4 ) + { + // Read ZA slices into Z regs + svfloat64_t z0 = svread_ver_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 0 ); + svfloat64_t z1 = svread_ver_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 0 ); + svfloat64_t z2 = svread_ver_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 0 ); + svfloat64_t z3 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 0 ); + svfloat64_t z4 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 0 ); + svfloat64_t z5 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 0 ); + svfloat64_t z6 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 0 ); + svfloat64_t z7 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 0 ); + + // Scale Z regs by broadcast alpha + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + // Store full result into C + svfloat64x4_t z400 = svcreate4( z0, z1, z2, z3 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 0 ) * rs_c], z400 ); + + svfloat64x4_t z600 = svcreate4( z4, z5, z6, z7 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 0 ) * rs_c], z600 ); + + // tcol + 1 + z0 = svread_ver_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 1 ); + z1 = svread_ver_za64_m( z1, svptrue_b32(), + 
/* tile: */ 1, /* slice: */ tcol + 1 ); + z2 = svread_ver_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 1 ); + z3 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 1 ); + z4 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 1 ); + z5 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 1 ); + z6 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 1 ); + z7 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 1 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + z400 = svcreate4( z0, z1, z2, z3 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 1 ) * rs_c], z400 ); + + z600 = svcreate4( z4, z5, z6, z7 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 1 ) * rs_c], z600 ); + + // tcol + 2 + z0 = svread_ver_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 2 ); + z1 = svread_ver_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 2 ); + z2 = svread_ver_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 2 ); + z3 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 2 ); + z4 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 2 ); + z5 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 2 ); + z6 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 2 ); + z7 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 2 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = 
svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + z400 = svcreate4( z0, z1, z2, z3 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 2 ) * rs_c], z400 ); + + z600 = svcreate4( z4, z5, z6, z7 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 2 ) * rs_c], z600 ); + + // tcol + 3 + z0 = svread_ver_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 3 ); + z1 = svread_ver_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 3 ); + z2 = svread_ver_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 3 ); + z3 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 3 ); + z4 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 3 ); + z5 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 3 ); + z6 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 3 ); + z7 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 3 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + z400 = svcreate4( z0, z1, z2, z3 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 3 ) * rs_c], z400 ); + + z600 = svcreate4( z4, z5, z6, z7 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 3 ) * rs_c], z600 ); + } + } + else + { + for ( 
uint64_t tcol = 0; tcol < SVL; tcol += 4 ) + { + // Read ZA slices into Z regs + svfloat64_t z0 = svread_ver_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 0 ); + svfloat64_t z1 = svread_ver_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 0 ); + svfloat64_t z2 = svread_ver_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 0 ); + svfloat64_t z3 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 0 ); + svfloat64_t z4 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 0 ); + svfloat64_t z5 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 0 ); + svfloat64_t z6 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 0 ); + svfloat64_t z7 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 0 ); + + // Scale Z regs by broadcast alpha + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + // Load C into Z regs + svfloat64x4_t zq5 = svld1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 0 ) * rs_c ) )] ); + svfloat64x4_t zq6 = svld1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( ( ( tcol + 0 ) * rs_c ) )] ); + + // Scale Z regs by broadcast beta + svfloat64_t z40 = svmla_m( svptrue_b32(), z0, svget4( zq5, 0 ), + zbeta ); + svfloat64_t z50 = svmla_m( svptrue_b32(), z1, svget4( zq5, 1 ), + zbeta ); + svfloat64_t z60 = svmla_m( svptrue_b32(), z2, svget4( zq5, 2 ), + zbeta ); + svfloat64_t z70 = svmla_m( svptrue_b32(), z3, svget4( zq5, 3 ), + zbeta ); + svfloat64_t z80 = svmla_m( svptrue_b32(), z4, svget4( zq6, 0 ), + zbeta ); + svfloat64_t z90 = 
svmla_m( svptrue_b32(), z5, svget4( zq6, 1 ), + zbeta ); + svfloat64_t za0 = svmla_m( svptrue_b32(), z6, svget4( zq6, 2 ), + zbeta ); + svfloat64_t zb0 = svmla_m( svptrue_b32(), z7, svget4( zq6, 3 ), + zbeta ); + + // Store full result into C + svfloat64x4_t z400 = svcreate4( z40, z50, z60, z70 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 0 ) * rs_c], z400 ); + + svfloat64x4_t z600 = svcreate4( z80, z90, za0, zb0 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 0 ) * rs_c], z600 ); + + // tcol + 1 + z0 = svread_ver_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 1 ); + z1 = svread_ver_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 1 ); + z2 = svread_ver_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 1 ); + z3 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 1 ); + z4 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 1 ); + z5 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 1 ); + z6 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 1 ); + z7 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 1 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + zq5 = svld1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 1 ) * rs_c ) )] ); + zq6 = svld1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( ( ( tcol + 1 ) * rs_c ) )] ); + + z40 = svmla_m( svptrue_b32(), z0, svget4( zq5, 0 ), zbeta ); + z50 = svmla_m( svptrue_b32(), z1, svget4( zq5, 1 ), zbeta ); + z60 = svmla_m( 
svptrue_b32(), z2, svget4( zq5, 2 ), zbeta ); + z70 = svmla_m( svptrue_b32(), z3, svget4( zq5, 3 ), zbeta ); + z80 = svmla_m( svptrue_b32(), z4, svget4( zq6, 0 ), zbeta ); + z90 = svmla_m( svptrue_b32(), z5, svget4( zq6, 1 ), zbeta ); + za0 = svmla_m( svptrue_b32(), z6, svget4( zq6, 2 ), zbeta ); + zb0 = svmla_m( svptrue_b32(), z7, svget4( zq6, 3 ), zbeta ); + + z400 = svcreate4( z40, z50, z60, z70 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 1 ) * rs_c], z400 ); + + z600 = svcreate4( z80, z90, za0, zb0 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 1 ) * rs_c], z600 ); + + // tcol + 2 + z0 = svread_ver_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 2 ); + z1 = svread_ver_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 2 ); + z2 = svread_ver_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 2 ); + z3 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 2 ); + z4 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 2 ); + z5 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 2 ); + z6 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 2 ); + z7 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 2 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + zq5 = svld1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 2 ) * rs_c ) )] ); + zq6 = svld1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( ( ( tcol + 2 ) * rs_c ) )] ); + + z40 = svmla_m( svptrue_b32(), z0, svget4( zq5, 0 ), zbeta 
); + z50 = svmla_m( svptrue_b32(), z1, svget4( zq5, 1 ), zbeta ); + z60 = svmla_m( svptrue_b32(), z2, svget4( zq5, 2 ), zbeta ); + z70 = svmla_m( svptrue_b32(), z3, svget4( zq5, 3 ), zbeta ); + z80 = svmla_m( svptrue_b32(), z4, svget4( zq6, 0 ), zbeta ); + z90 = svmla_m( svptrue_b32(), z5, svget4( zq6, 1 ), zbeta ); + za0 = svmla_m( svptrue_b32(), z6, svget4( zq6, 2 ), zbeta ); + zb0 = svmla_m( svptrue_b32(), z7, svget4( zq6, 3 ), zbeta ); + + z400 = svcreate4( z40, z50, z60, z70 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 2 ) * rs_c], z400 ); + + z600 = svcreate4( z80, z90, za0, zb0 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 2 ) * rs_c], z600 ); + + // tcol + 3 + z0 = svread_ver_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 3 ); + z1 = svread_ver_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 3 ); + z2 = svread_ver_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 3 ); + z3 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 3 ); + z4 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 3 ); + z5 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 3 ); + z6 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 3 ); + z7 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 3 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + zq5 = svld1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 3 ) * rs_c ) )] ); + zq6 = svld1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( ( ( tcol 
+ 3 ) * rs_c ) )] ); + + z40 = svmla_m( svptrue_b32(), z0, svget4( zq5, 0 ), zbeta ); + z50 = svmla_m( svptrue_b32(), z1, svget4( zq5, 1 ), zbeta ); + z60 = svmla_m( svptrue_b32(), z2, svget4( zq5, 2 ), zbeta ); + z70 = svmla_m( svptrue_b32(), z3, svget4( zq5, 3 ), zbeta ); + z80 = svmla_m( svptrue_b32(), z4, svget4( zq6, 0 ), zbeta ); + z90 = svmla_m( svptrue_b32(), z5, svget4( zq6, 1 ), zbeta ); + za0 = svmla_m( svptrue_b32(), z6, svget4( zq6, 2 ), zbeta ); + zb0 = svmla_m( svptrue_b32(), z7, svget4( zq6, 3 ), zbeta ); + + z400 = svcreate4( z40, z50, z60, z70 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 3 ) * rs_c], z400 ); + + z600 = svcreate4( z80, z90, za0, zb0 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 3 ) * rs_c], z600 ); + } + } + } + else + { + const uint64_t result_tile_TR_corner = SVL * cs_c; + const uint64_t result_tile_BL_corner = SVL * 2 * cs_c; + const uint64_t result_tile_BR_corner = SVL * 3 * cs_c; + + if ( beta_ == 0 ) + { + for ( uint64_t tcol = 0; tcol < SVL; tcol += 4 ) + { + // Read ZA slices into Z regs + svfloat64_t z0 = svread_hor_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 0 ); + svfloat64_t z1 = svread_hor_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 0 ); + svfloat64_t z2 = svread_hor_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 0 ); + svfloat64_t z3 = svread_hor_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 0 ); + svfloat64_t z4 = svread_hor_za64_m( z4, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 0 ); + svfloat64_t z5 = svread_hor_za64_m( z5, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 0 ); + svfloat64_t z6 = svread_hor_za64_m( z6, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 0 ); + svfloat64_t z7 = svread_hor_za64_m( z7, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 0 ); + + // Scale Z regs by broadcast alpha + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( 
svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + // Store full result into C + svfloat64x2_t z400 = svcreate2( z0, z4 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 0 ) * cs_c], z400 ); + + svfloat64x2_t z600 = svcreate2( z1, z5 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 0 ) * cs_c], z600 ); + svfloat64x2_t z700 = svcreate2( z2, z6 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_BL_corner + ( tcol + 0 ) * cs_c], z700 ); + + svfloat64x2_t z800 = svcreate2( z3, z7 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_BR_corner + ( tcol + 0 ) * cs_c], z800 ); + + // tcol + 1 + z0 = svread_hor_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 1 ); + z1 = svread_hor_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 1 ); + z2 = svread_hor_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 1 ); + z3 = svread_hor_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 1 ); + z4 = svread_hor_za64_m( z4, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 1 ); + z5 = svread_hor_za64_m( z5, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 1 ); + z6 = svread_hor_za64_m( z6, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 1 ); + z7 = svread_hor_za64_m( z7, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 1 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, 
zalpha ); + + z400 = svcreate2( z0, z4 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 1 ) * cs_c], z400 ); + + z600 = svcreate2( z1, z5 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 1 ) * cs_c], z600 ); + + z700 = svcreate2( z2, z6 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_BL_corner + ( tcol + 1 ) * cs_c], z700 ); + + z800 = svcreate2( z3, z7 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_BR_corner + ( tcol + 1 ) * cs_c], z800 ); + + // tcol + 2 + z0 = svread_hor_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 2 ); + z1 = svread_hor_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 2 ); + z2 = svread_hor_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 2 ); + z3 = svread_hor_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 2 ); + z4 = svread_hor_za64_m( z4, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 2 ); + z5 = svread_hor_za64_m( z5, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 2 ); + z6 = svread_hor_za64_m( z6, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 2 ); + z7 = svread_hor_za64_m( z7, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 2 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + z400 = svcreate2( z0, z4 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 2 ) * cs_c], z400 ); + + z600 = svcreate2( z1, z5 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 2 ) * cs_c], z600 ); + + z700 = svcreate2( z2, z6 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_BL_corner + ( tcol + 2 ) * cs_c], z700 ); + + z800 = svcreate2( 
z3, z7 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_BR_corner + ( tcol + 2 ) * cs_c], z800 ); + + // tcol + 3 + z0 = svread_hor_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 3 ); + z1 = svread_hor_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 3 ); + z2 = svread_hor_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 3 ); + z3 = svread_hor_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 3 ); + z4 = svread_hor_za64_m( z4, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 3 ); + z5 = svread_hor_za64_m( z5, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 3 ); + z6 = svread_hor_za64_m( z6, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 3 ); + z7 = svread_hor_za64_m( z7, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 3 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + z400 = svcreate2( z0, z4 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 3 ) * cs_c], z400 ); + + z600 = svcreate2( z1, z5 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 3 ) * cs_c], z600 ); + + z700 = svcreate2( z2, z6 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_BL_corner + ( tcol + 3 ) * cs_c], z700 ); + + z800 = svcreate2( z3, z7 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_BR_corner + ( tcol + 3 ) * cs_c], z800 ); + } + } + else + { + for ( uint64_t tcol = 0; tcol < SVL; tcol += 4 ) + { + // Read ZA slices into Z regs + svfloat64_t z0 = svread_hor_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 0 ); + svfloat64_t z1 = svread_hor_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 0 
); + svfloat64_t z2 = svread_hor_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 0 ); + svfloat64_t z3 = svread_hor_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 0 ); + svfloat64_t z4 = svread_hor_za64_m( z4, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 0 ); + svfloat64_t z5 = svread_hor_za64_m( z5, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 0 ); + svfloat64_t z6 = svread_hor_za64_m( z6, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 0 ); + svfloat64_t z7 = svread_hor_za64_m( z7, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 0 ); + + // Scale Z regs by broadcast alpha + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + // Load C into Z regs + svfloat64x2_t zq5 = svld1_f64_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 0 ) * cs_c ) )] ); + svfloat64x2_t zq6 = svld1_f64_x2( svptrue_c32(), + &c_[result_tile_TR_corner + ( ( ( tcol + 0 ) * cs_c ) )] ); + svfloat64x2_t zq7 = svld1_f64_x2( svptrue_c32(), + &c_[result_tile_BL_corner + ( ( ( tcol + 0 ) * cs_c ) )] ); + svfloat64x2_t zq8 = svld1_f64_x2( svptrue_c32(), + &c_[result_tile_BR_corner + ( ( ( tcol + 0 ) * cs_c ) )] ); + + // Scale Z regs by broadcast beta (reordered ZA tiles to match + // horizontal order) + svfloat64_t z40 = svmla_m( svptrue_b32(), z0, svget2( zq5, 0 ), + zbeta ); + svfloat64_t z50 = svmla_m( svptrue_b32(), z4, svget2( zq5, 1 ), + zbeta ); + svfloat64_t z60 = svmla_m( svptrue_b32(), z1, svget2( zq6, 0 ), + zbeta ); + svfloat64_t z70 = svmla_m( svptrue_b32(), z5, svget2( zq6, 1 ), + zbeta ); + svfloat64_t z80 = svmla_m( svptrue_b32(), z2, svget2( zq7, 0 ), + zbeta ); + svfloat64_t z90 
= svmla_m( svptrue_b32(), z6, svget2( zq7, 1 ), + zbeta ); + svfloat64_t za0 = svmla_m( svptrue_b32(), z3, svget2( zq8, 0 ), + zbeta ); + svfloat64_t zb0 = svmla_m( svptrue_b32(), z7, svget2( zq8, 1 ), + zbeta ); + + // Store full result into C + svfloat64x2_t z400 = svcreate2( z40, z50 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 0 ) * cs_c], z400 ); + + svfloat64x2_t z600 = svcreate2( z60, z70 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 0 ) * cs_c], z600 ); + svfloat64x2_t z700 = svcreate2( z80, z90 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_BL_corner + ( tcol + 0 ) * cs_c], z700 ); + + svfloat64x2_t z800 = svcreate2( za0, zb0 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_BR_corner + ( tcol + 0 ) * cs_c], z800 ); + + // tcol + 1 + z0 = svread_hor_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 1 ); + z1 = svread_hor_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 1 ); + z2 = svread_hor_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 1 ); + z3 = svread_hor_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 1 ); + z4 = svread_hor_za64_m( z4, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 1 ); + z5 = svread_hor_za64_m( z5, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 1 ); + z6 = svread_hor_za64_m( z6, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 1 ); + z7 = svread_hor_za64_m( z7, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 1 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + zq5 = svld1_f64_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 1 ) * cs_c ) 
)] ); + zq6 = svld1_f64_x2( svptrue_c32(), + &c_[result_tile_TR_corner + ( ( ( tcol + 1 ) * cs_c ) )] ); + zq7 = svld1_f64_x2( svptrue_c32(), + &c_[result_tile_BL_corner + ( ( ( tcol + 1 ) * cs_c ) )] ); + zq8 = svld1_f64_x2( svptrue_c32(), + &c_[result_tile_BR_corner + ( ( ( tcol + 1 ) * cs_c ) )] ); + + z40 = svmla_m( svptrue_b32(), z0, svget2( zq5, 0 ), zbeta ); + z50 = svmla_m( svptrue_b32(), z4, svget2( zq5, 1 ), zbeta ); + z60 = svmla_m( svptrue_b32(), z1, svget2( zq6, 0 ), zbeta ); + z70 = svmla_m( svptrue_b32(), z5, svget2( zq6, 1 ), zbeta ); + z80 = svmla_m( svptrue_b32(), z2, svget2( zq7, 0 ), zbeta ); + z90 = svmla_m( svptrue_b32(), z6, svget2( zq7, 1 ), zbeta ); + za0 = svmla_m( svptrue_b32(), z3, svget2( zq8, 0 ), zbeta ); + zb0 = svmla_m( svptrue_b32(), z7, svget2( zq8, 1 ), zbeta ); + + z400 = svcreate2( z40, z50 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 1 ) * cs_c], z400 ); + + z600 = svcreate2( z60, z70 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 1 ) * cs_c], z600 ); + + z700 = svcreate2( z80, z90 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_BL_corner + ( tcol + 1 ) * cs_c], z700 ); + + z800 = svcreate2( za0, zb0 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_BR_corner + ( tcol + 1 ) * cs_c], z800 ); + + // tcol + 2 + z0 = svread_hor_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 2 ); + z1 = svread_hor_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 2 ); + z2 = svread_hor_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 2 ); + z3 = svread_hor_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 2 ); + z4 = svread_hor_za64_m( z4, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 2 ); + z5 = svread_hor_za64_m( z5, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 2 ); + z6 = svread_hor_za64_m( z6, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 2 ); + z7 = svread_hor_za64_m( z7, svptrue_b32(), + /* tile: */ 7, /* 
slice: */ tcol + 2 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + zq5 = svld1_f64_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 2 ) * cs_c ) )] ); + zq6 = svld1_f64_x2( svptrue_c32(), + &c_[result_tile_TR_corner + ( ( ( tcol + 2 ) * cs_c ) )] ); + zq7 = svld1_f64_x2( svptrue_c32(), + &c_[result_tile_BL_corner + ( ( ( tcol + 2 ) * cs_c ) )] ); + zq8 = svld1_f64_x2( svptrue_c32(), + &c_[result_tile_BR_corner + ( ( ( tcol + 2 ) * cs_c ) )] ); + + z40 = svmla_m( svptrue_b32(), z0, svget2( zq5, 0 ), zbeta ); + z50 = svmla_m( svptrue_b32(), z4, svget2( zq5, 1 ), zbeta ); + z60 = svmla_m( svptrue_b32(), z1, svget2( zq6, 0 ), zbeta ); + z70 = svmla_m( svptrue_b32(), z5, svget2( zq6, 1 ), zbeta ); + z80 = svmla_m( svptrue_b32(), z2, svget2( zq7, 0 ), zbeta ); + z90 = svmla_m( svptrue_b32(), z6, svget2( zq7, 1 ), zbeta ); + za0 = svmla_m( svptrue_b32(), z3, svget2( zq8, 0 ), zbeta ); + zb0 = svmla_m( svptrue_b32(), z7, svget2( zq8, 1 ), zbeta ); + + z400 = svcreate2( z40, z50 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 2 ) * cs_c], z400 ); + + z600 = svcreate2( z60, z70 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 2 ) * cs_c], z600 ); + + z700 = svcreate2( z80, z90 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_BL_corner + ( tcol + 2 ) * cs_c], z700 ); + + z800 = svcreate2( za0, zb0 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_BR_corner + ( tcol + 2 ) * cs_c], z800 ); + + // tcol + 3 + z0 = svread_hor_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 3 ); + z1 = svread_hor_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ 
tcol + 3 ); + z2 = svread_hor_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 3 ); + z3 = svread_hor_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 3 ); + z4 = svread_hor_za64_m( z4, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 3 ); + z5 = svread_hor_za64_m( z5, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 3 ); + z6 = svread_hor_za64_m( z6, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 3 ); + z7 = svread_hor_za64_m( z7, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 3 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + zq5 = svld1_f64_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 3 ) * cs_c ) )] ); + zq6 = svld1_f64_x2( svptrue_c32(), + &c_[result_tile_TR_corner + ( ( ( tcol + 3 ) * cs_c ) )] ); + zq7 = svld1_f64_x2( svptrue_c32(), + &c_[result_tile_BL_corner + ( ( ( tcol + 3 ) * cs_c ) )] ); + zq8 = svld1_f64_x2( svptrue_c32(), + &c_[result_tile_BR_corner + ( ( ( tcol + 3 ) * cs_c ) )] ); + + z40 = svmla_m( svptrue_b32(), z0, svget2( zq5, 0 ), zbeta ); + z50 = svmla_m( svptrue_b32(), z4, svget2( zq5, 1 ), zbeta ); + z60 = svmla_m( svptrue_b32(), z1, svget2( zq6, 0 ), zbeta ); + z70 = svmla_m( svptrue_b32(), z5, svget2( zq6, 1 ), zbeta ); + z80 = svmla_m( svptrue_b32(), z2, svget2( zq7, 0 ), zbeta ); + z90 = svmla_m( svptrue_b32(), z6, svget2( zq7, 1 ), zbeta ); + za0 = svmla_m( svptrue_b32(), z3, svget2( zq8, 0 ), zbeta ); + zb0 = svmla_m( svptrue_b32(), z7, svget2( zq8, 1 ), zbeta ); + + z400 = svcreate2( z40, z50 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 3 ) * cs_c], z400 ); + + z600 = svcreate2( z60, z70 ); + 
svst1_f64_x2( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 3 ) * cs_c], z600 ); + + z700 = svcreate2( z80, z90 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_BL_corner + ( tcol + 3 ) * cs_c], z700 ); + + z800 = svcreate2( za0, zb0 ); + svst1_f64_x2( svptrue_c32(), + &c_[result_tile_BR_corner + ( tcol + 3 ) * cs_c], z800 ); + } + } + } + GEMM_UKR_FLUSH_CT( d ); + + return; +} + diff --git a/kernels/armsme/3/bli_gemm_armsme_int_s4SVLxSVL.c b/kernels/armsme/3/bli_gemm_armsme_int_s4SVLxSVL.c new file mode 100644 index 000000000..b67a4b41a --- /dev/null +++ b/kernels/armsme/3/bli_gemm_armsme_int_s4SVLxSVL.c @@ -0,0 +1,1676 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2021, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include // NOTE(review): header name lost in extraction (angle-bracket text stripped) — presumably <arm_sme.h>; restore before applying this patch
+#include "blis.h"
+
+__arm_new( "za" ) __arm_locally_streaming void bli_sgemm_armsme_int_4SVLxSVL // sgemm ukr: C(4*SVL x SVL) = alpha*A*B + beta*C, accumulated in ZA tiles 0-3
+  (
+    dim_t m,
+    dim_t n,
+    dim_t k,
+    const void* alpha,
+    const void* a,
+    const void* b,
+    const void* beta,
+    void* c, inc_t rs_c, inc_t cs_c,
+    const auxinfo_t* data,
+    const cntx_t* cntx
+  )
+{
+  uint64_t SVL = svcntsw(); // streaming vector length in 32-bit words
+
+  GEMM_UKR_SETUP_CT_AMBI( s, 4 * SVL, SVL, false ); // MR = 4*SVL, NR = SVL
+
+  float* a_ = (float*)a;
+  float* b_ = (float*)b;
+
+  const void* a_next = bli_auxinfo_next_a( data );
+  const void* b_next = bli_auxinfo_next_b( data );
+
+  float* c_ = (float*)c;
+
+  svzero_za(); // clear the ZA accumulator before the k loop
+
+  uint64_t k_;
+  uint64_t k_iter = k / 4; // main loop unrolled by 4
+  uint64_t k_left = k % 4;
+
+  for ( k_ = 0; k_ < k_iter; k_++ )
+  {
+    // Loads
+    svfloat32x4_t zL00 = svld1_f32_x4( svptrue_c32(), (float32_t*)( &b_[0] ) ); // 4 k-steps of B (one vector each)
+    svfloat32x4_t zR00 = svld1_f32_x4( svptrue_c32(), (float32_t*)( &a_[0] ) ); // 4*SVL rows of A for k-step 0
+
+    // Outer-product accumulate: ZA tile t += zL (x) zR, tiles 0-3 cover the 4*SVL m-extent
+    svmopa_za32_m( 0, svptrue_b32(), svptrue_b32(), svget4( zL00, 0 ), svget4( zR00, 0 ) );
+    svmopa_za32_m( 1, svptrue_b32(), svptrue_b32(), svget4( zL00, 0 ), svget4( zR00, 1 ) );
+
+    svfloat32x4_t zR01 = svld1_f32_x4( svptrue_c32(), (float32_t*)( &a_[( 4 * SVL )] ) );
+
+    svmopa_za32_m( 2, svptrue_b32(), svptrue_b32(), svget4( zL00, 0 ), svget4( zR00, 2 ) );
+    svmopa_za32_m( 3, svptrue_b32(), svptrue_b32(), svget4( zL00, 0 ), svget4( zR00, 3 ) );
+
+    svmopa_za32_m( 0, svptrue_b32(), svptrue_b32(), svget4( zL00, 1 ), svget4( zR01, 0 ) );
+    svmopa_za32_m( 1, svptrue_b32(), svptrue_b32(), svget4( zL00, 1 ), svget4( zR01, 1 ) );
+
+    svfloat32x4_t zR02 = svld1_f32_x4( svptrue_c32(), (float32_t*)( &a_[2 * ( 4 * SVL )] ) );
+
+    svmopa_za32_m( 2, svptrue_b32(), svptrue_b32(), svget4( zL00, 1 ), svget4( zR01, 2 ) );
+    svmopa_za32_m( 3, svptrue_b32(), svptrue_b32(), svget4( zL00, 1 ), svget4( zR01, 3 ) );
+
+    svmopa_za32_m( 0, svptrue_b32(), svptrue_b32(), svget4( zL00, 2 ), svget4( zR02, 0 ) );
+    svmopa_za32_m( 1, svptrue_b32(), svptrue_b32(), svget4( zL00, 2 ), svget4( zR02, 1 ) );
+
+    svfloat32x4_t zR03 = svld1_f32_x4( svptrue_c32(), (float32_t*)( &a_[3 * ( 4 * SVL )] ) );
+
+    svmopa_za32_m( 2, svptrue_b32(), svptrue_b32(), svget4( zL00, 2 ), svget4( zR02, 2 ) );
+    svmopa_za32_m( 3, svptrue_b32(), svptrue_b32(), svget4( zL00, 2 ), svget4( zR02, 3 ) );
+
+    svmopa_za32_m( 0, svptrue_b32(), svptrue_b32(), svget4( zL00, 3 ), svget4( zR03, 0 ) );
+    svprfb( svptrue_b32(), (float*)&a_next, 0 ); // NOTE(review): prefetches the address of the local variable a_next, not the next A panel (*a_next) — looks unintended; confirm
+    svmopa_za32_m( 1, svptrue_b32(), svptrue_b32(), svget4( zL00, 3 ), svget4( zR03, 1 ) );
+
+    svprfb( svptrue_b32(), (float*)&b_next, 0 ); // NOTE(review): same concern as above for b_next
+    svmopa_za32_m( 2, svptrue_b32(), svptrue_b32(), svget4( zL00, 3 ), svget4( zR03, 2 ) );
+    svmopa_za32_m( 3, svptrue_b32(), svptrue_b32(), svget4( zL00, 3 ), svget4( zR03, 3 ) );
+
+    b_ += ( 4 * SVL );
+    a_ += ( 4 * 4 * SVL );
+  }
+
+  for ( k_ = 0; k_ < k_left; k_ += 1 ) // remainder k-steps, one rank-1 update per tile each
+  {
+    svfloat32_t zL00 = svld1_f32( svptrue_b32(), (float32_t*)( &b_[0] ) );
+    svfloat32x4_t zR00 = svld1_f32_x4( svptrue_c32(), (float32_t*)( &a_[0] ) );
+
+    svmopa_za32_m( 0, svptrue_b32(), svptrue_b32(), zL00, svget4( zR00, 0 ) );
+    svmopa_za32_m( 1, svptrue_b32(), svptrue_b32(), zL00, svget4( zR00, 1 ) );
+
+    svmopa_za32_m( 2, svptrue_b32(), svptrue_b32(), zL00, svget4( zR00, 2 ) );
+    svmopa_za32_m( 3, svptrue_b32(), svptrue_b32(), zL00, svget4( zR00, 3 ) );
+
+    b_ += ( SVL );
+    a_ += ( 4 * SVL );
+  }
+
+  float beta_ = *(float*)beta;
+  float alpha_ = *(float*)alpha;
+
+  // NOTE(review): the four "corners" are four consecutive SVL-row panels stacked along m (offsets 0,1,2,3 * SVL*rs_c); the TL/BL/TR/BR names suggest a 2x2 grid but the offsets do not
+  const uint64_t result_tile_TL_corner = 0;
+  const uint64_t result_tile_BL_corner = SVL * rs_c;
+  const uint64_t result_tile_TR_corner = SVL * 2 * rs_c;
+  const uint64_t result_tile_BR_corner = SVL * 3 * rs_c;
+
+  svfloat32_t zbeta = svdup_f32( beta_ );
+  svfloat32_t zalpha = svdup_f32( alpha_ );
+
+  if ( cs_c == 1 ) // unit column stride: each vertical ZA slice maps to a contiguous run of C
+  {
+    if ( beta_ == 0 ) // beta == 0: overwrite C without reading it
+    {
+      for ( uint64_t tcol = 0; tcol < SVL; tcol += 4 )
+      {
+        // Read ZA slices into Z regs
+        svfloat32_t z0 = svread_ver_za32_m( z0, svptrue_b32(), /* tile: */ 0, /* slice: */ tcol + 0 );
+        svfloat32_t z1 = svread_ver_za32_m( z1, svptrue_b32(), /* tile: */ 1, /* slice: */ tcol + 0 );
+        svfloat32_t z2 = svread_ver_za32_m( z2, svptrue_b32(), /* tile: */ 2, /* slice: */ tcol + 0 );
+        svfloat32_t z3 = svread_ver_za32_m( z3, svptrue_b32(), /* tile: */ 3, /* slice: */ tcol + 0 );
+
+        // Scale Z regs by broadcast alpha
+        z0 = svmul_f32_z( svptrue_b32(), z0, zalpha );
+        z1 = svmul_f32_z( svptrue_b32(), z1, zalpha );
+        z2 = svmul_f32_z( svptrue_b32(), z2, zalpha );
+        z3 = svmul_f32_z( svptrue_b32(), z3, zalpha );
+
+        // Store full result into C
+        svst1( svptrue_b32(), &c_[result_tile_TL_corner + ( tcol + 0 ) * rs_c], z0 );
+        svst1( svptrue_b32(), &c_[result_tile_BL_corner + ( tcol + 0 ) * rs_c], z1 );
+        svst1( svptrue_b32(), &c_[result_tile_TR_corner + ( tcol + 0 ) * rs_c], z2 );
+        svst1( svptrue_b32(), &c_[result_tile_BR_corner + ( tcol + 0 ) * rs_c], z3 );
+
+        // Repeat unfolded x4
+        z0 = svread_ver_za32_m( z0, svptrue_b32(), /* tile: */ 0, /* slice: */ tcol + 1 );
+        z1 = svread_ver_za32_m( z1, svptrue_b32(), /* tile: */ 1, /* slice: */ tcol + 1 );
+        z2 = svread_ver_za32_m( z2, svptrue_b32(), /* tile: */ 2, /* slice: */ tcol + 1 );
+        z3 = svread_ver_za32_m( z3, svptrue_b32(), /* tile: */ 3, /* slice: */ tcol + 1 );
+
+        z0 = svmul_f32_z( svptrue_b32(), z0, zalpha );
+        z1 = svmul_f32_z( svptrue_b32(), z1, zalpha );
+        z2 = svmul_f32_z( svptrue_b32(), z2, zalpha );
+        z3 = svmul_f32_z( svptrue_b32(), z3, zalpha );
+
+        svst1( svptrue_b32(), &c_[result_tile_TL_corner + ( tcol + 1 ) * rs_c], z0 );
+        svst1( svptrue_b32(), &c_[result_tile_BL_corner + ( tcol + 1 ) * rs_c], z1 );
+        svst1( svptrue_b32(), &c_[result_tile_TR_corner + ( tcol + 1 ) * rs_c], z2 );
+        svst1( svptrue_b32(), &c_[result_tile_BR_corner + ( tcol + 1 ) * rs_c], z3 );
+
+        z0 = svread_ver_za32_m( z0, svptrue_b32(), /* tile: */ 0, /* slice: */ tcol + 2 );
+        z1 = svread_ver_za32_m( z1, svptrue_b32(), /* tile: */ 1, /* slice: */ tcol + 2 );
+        z2 = svread_ver_za32_m( z2, svptrue_b32(), /* tile: */ 2, /* slice: */ tcol + 2 );
+        z3 = svread_ver_za32_m( z3, svptrue_b32(), /* tile: */ 3, /* slice: */ tcol + 2 );
+
+        z0 = svmul_f32_z( svptrue_b32(), z0, zalpha );
+        z1 = svmul_f32_z( svptrue_b32(), z1, zalpha );
+        z2 = svmul_f32_z( svptrue_b32(), z2, zalpha );
+        z3 = svmul_f32_z( svptrue_b32(), z3, zalpha );
+
+        svst1( svptrue_b32(), &c_[result_tile_TL_corner + ( tcol + 2 ) * rs_c], z0 );
+        svst1( svptrue_b32(), &c_[result_tile_BL_corner + ( tcol + 2 ) * rs_c], z1 );
+        svst1( svptrue_b32(), &c_[result_tile_TR_corner + ( tcol + 2 ) * rs_c], z2 );
+        svst1( svptrue_b32(), &c_[result_tile_BR_corner + ( tcol + 2 ) * rs_c], z3 );
+
+        z0 = svread_ver_za32_m( z0, svptrue_b32(), /* tile: */ 0, /* slice: */ tcol + 3 );
+        z1 = svread_ver_za32_m( z1, svptrue_b32(), /* tile: */ 1, /* slice: */ tcol + 3 );
+        z2 = svread_ver_za32_m( z2, svptrue_b32(), /* tile: */ 2, /* slice: */ tcol + 3 );
+        z3 = svread_ver_za32_m( z3, svptrue_b32(), /* tile: */ 3, /* slice: */ tcol + 3 );
+
+        z0 = svmul_f32_z( svptrue_b32(), z0, zalpha );
+        z1 = svmul_f32_z( svptrue_b32(), z1, zalpha );
+        z2 = svmul_f32_z( svptrue_b32(), z2, zalpha );
+        z3 = svmul_f32_z( svptrue_b32(), z3, zalpha );
+
+        svst1( svptrue_b32(), &c_[result_tile_TL_corner + ( tcol + 3 ) * rs_c], z0 );
+        svst1( svptrue_b32(), &c_[result_tile_BL_corner + ( tcol + 3 ) * rs_c], z1 );
+        svst1( svptrue_b32(), &c_[result_tile_TR_corner + ( tcol + 3 ) * rs_c], z2 );
+        svst1( svptrue_b32(), &c_[result_tile_BR_corner + ( tcol + 3 ) * rs_c], z3 );
+      }
+    }
+    else // beta != 0: C = alpha*AB + beta*C
+    {
+      for ( uint64_t tcol = 0; tcol < SVL; tcol += 4 )
+      {
+        // Read ZA slices into Z regs
+        svfloat32_t z0 = svread_ver_za32_m( z0, svptrue_b32(), /* tile: */ 0, /* slice: */ tcol + 0 );
+        svfloat32_t z1 = svread_ver_za32_m( z1, svptrue_b32(), /* tile: */ 1, /* slice: */ tcol + 0 );
+        svfloat32_t z2 = svread_ver_za32_m( z2, svptrue_b32(), /* tile: */ 2, /* slice: */ tcol + 0 );
+        svfloat32_t z3 = svread_ver_za32_m( z3, svptrue_b32(), /* tile: */ 3, /* slice: */ tcol + 0 );
+
+        // Scale Z regs by broadcast alpha
+        z0 = svmul_f32_z( svptrue_b32(), z0, zalpha );
+        z1 = svmul_f32_z( svptrue_b32(), z1, zalpha );
+        z2 = svmul_f32_z( svptrue_b32(), z2, zalpha );
+        z3 = svmul_f32_z( svptrue_b32(), z3, zalpha );
+
+        // Load C into Z regs
+        svfloat32_t z4 = svld1_f32( svptrue_b32(), &c_[result_tile_TL_corner + ( tcol + 0 ) * rs_c] );
+        svfloat32_t z5 = svld1_f32( svptrue_b32(), &c_[result_tile_BL_corner + ( tcol + 0 ) * rs_c] );
+        svfloat32_t z6 = svld1_f32( svptrue_b32(), &c_[result_tile_TR_corner + ( tcol + 0 ) * rs_c] );
+        svfloat32_t z7 = svld1_f32( svptrue_b32(), &c_[result_tile_BR_corner + ( tcol + 0 ) * rs_c] );
+
+        // Scale Z regs by broadcast beta
+        z4 = svmla_m( svptrue_b32(), z0, z4, zbeta );
+        z5 = svmla_m( svptrue_b32(), z1, z5, zbeta );
+        z6 = svmla_m( svptrue_b32(), z2, z6, zbeta );
+        z7 = svmla_m( svptrue_b32(), z3, z7, zbeta );
+
+        // Store full result into C
+        svst1( svptrue_b32(), &c_[result_tile_TL_corner + ( tcol + 0 ) * rs_c], z4 );
+        svst1( svptrue_b32(), &c_[result_tile_BL_corner + ( tcol + 0 ) * rs_c], z5 );
+        svst1( svptrue_b32(), &c_[result_tile_TR_corner + ( tcol + 0 ) * rs_c], z6 );
+        svst1( svptrue_b32(), &c_[result_tile_BR_corner + ( tcol + 0 ) * rs_c], z7 );
+
+        // Repeat unfolded x4
+        z0 = svread_ver_za32_m( z0, svptrue_b32(), /* tile: */ 0, /* slice: */ tcol + 1 );
+        z1 = svread_ver_za32_m( z1, svptrue_b32(), /* tile: */ 1, /* slice: */ tcol + 1 );
+        z2 = svread_ver_za32_m( z2, svptrue_b32(), /* tile: */ 2, /* slice: */ tcol + 1 );
+        z3 = svread_ver_za32_m( z3, svptrue_b32(), /* tile: */ 3, /* slice: */ tcol + 1 );
+
+        z0 = svmul_f32_z( svptrue_b32(), z0, zalpha );
+        z1 = svmul_f32_z( svptrue_b32(), z1, zalpha );
+        z2 = svmul_f32_z( svptrue_b32(), z2, zalpha );
+        z3 = svmul_f32_z( svptrue_b32(), z3, zalpha );
+
+        z4 = svld1_f32( svptrue_b32(), &c_[result_tile_TL_corner + ( tcol + 1 ) * rs_c] );
+        z5 = svld1_f32( svptrue_b32(), &c_[result_tile_BL_corner + ( tcol + 1 ) * rs_c] );
+        z6 = svld1_f32( svptrue_b32(), &c_[result_tile_TR_corner + ( tcol + 1 ) * rs_c] );
+        z7 = svld1_f32( svptrue_b32(), &c_[result_tile_BR_corner + ( tcol + 1 ) * rs_c] );
+
+        z4 = svmla_m( svptrue_b32(), z0, z4, zbeta );
+        z5 = svmla_m( svptrue_b32(), z1, z5, zbeta );
+        z6 = svmla_m( svptrue_b32(), z2, z6, zbeta );
+        z7 = svmla_m( svptrue_b32(), z3, z7, zbeta );
+
+        svst1( svptrue_b32(), &c_[result_tile_TL_corner + ( tcol + 1 ) * rs_c], z4 );
+        svst1( svptrue_b32(), &c_[result_tile_BL_corner + ( tcol + 1 ) * rs_c], z5 );
+        svst1( svptrue_b32(), &c_[result_tile_TR_corner + ( tcol + 1 ) * rs_c], z6 );
+        svst1( svptrue_b32(), &c_[result_tile_BR_corner + ( tcol + 1 ) * rs_c], z7 );
+
+        z0 = svread_ver_za32_m( z0, svptrue_b32(), /* tile: */ 0, /* slice: */ tcol + 2 );
+        z1 = svread_ver_za32_m( z1, svptrue_b32(), /* tile: */ 1, /* slice: */ tcol + 2 );
+        z2 = svread_ver_za32_m( z2, svptrue_b32(), /* tile: */ 2, /* slice: */ tcol + 2 );
+        z3 = svread_ver_za32_m( z3, svptrue_b32(), /* tile: */ 3, /* slice: */ tcol + 2 );
+
+        z0 = svmul_f32_z( svptrue_b32(), z0, zalpha );
+        z1 = svmul_f32_z( svptrue_b32(), z1, zalpha );
+        z2 = svmul_f32_z( svptrue_b32(), z2, zalpha );
+        z3 = svmul_f32_z( svptrue_b32(), z3, zalpha );
+
+        z4 = svld1_f32( svptrue_b32(), &c_[result_tile_TL_corner + ( tcol + 2 ) * rs_c] );
+        z5 = svld1_f32( svptrue_b32(), &c_[result_tile_BL_corner + ( tcol + 2 ) * rs_c] );
+        z6 = svld1_f32( svptrue_b32(), &c_[result_tile_TR_corner + ( tcol + 2 ) * rs_c] );
+        z7 = svld1_f32( svptrue_b32(), &c_[result_tile_BR_corner + ( tcol + 2 ) * rs_c] );
+
+        z4 = svmla_m( svptrue_b32(), z0, z4, zbeta );
+        z5 = svmla_m( svptrue_b32(), z1, z5, zbeta );
+        z6 = svmla_m( svptrue_b32(), z2, z6, zbeta );
+        z7 = svmla_m( svptrue_b32(), z3, z7, zbeta );
+
+        svst1( svptrue_b32(), &c_[result_tile_TL_corner + ( tcol + 2 ) * rs_c], z4 );
+        svst1( svptrue_b32(), &c_[result_tile_BL_corner + ( tcol + 2 ) * rs_c], z5 );
+        svst1( svptrue_b32(), &c_[result_tile_TR_corner + ( tcol + 2 ) * rs_c], z6 );
+        svst1( svptrue_b32(), &c_[result_tile_BR_corner + ( tcol + 2 ) * rs_c], z7 );
+
+        z0 = svread_ver_za32_m( z0, svptrue_b32(), /* tile: */ 0, /* slice: */ tcol + 3 );
+        z1 = svread_ver_za32_m( z1, svptrue_b32(), /* tile: */ 1, /* slice: */ tcol + 3 );
+        z2 = svread_ver_za32_m( z2, svptrue_b32(), /* tile: */ 2, /* slice: */ tcol + 3 );
+        z3 = svread_ver_za32_m( z3, svptrue_b32(), /* tile: */ 3, /* slice: */ tcol + 3 );
+
+        z0 = svmul_f32_z( svptrue_b32(), z0, zalpha );
+        z1 = svmul_f32_z( svptrue_b32(), z1, zalpha );
+        z2 = svmul_f32_z( svptrue_b32(), z2, zalpha );
+        z3 = svmul_f32_z( svptrue_b32(), z3, zalpha );
+
+        z4 = svld1_f32( svptrue_b32(), &c_[result_tile_TL_corner + ( tcol + 3 ) * rs_c] );
+        z5 = svld1_f32( svptrue_b32(), &c_[result_tile_BL_corner + ( tcol + 3 ) * rs_c] );
+        z6 = svld1_f32( svptrue_b32(), &c_[result_tile_TR_corner + ( tcol + 3 ) * rs_c] );
+        z7 = svld1_f32( svptrue_b32(), &c_[result_tile_BR_corner + ( tcol + 3 ) * rs_c] );
+
+        z4 = svmla_m( svptrue_b32(), z0, z4, zbeta );
+        z5 = svmla_m( svptrue_b32(), z1, z5, zbeta );
+        z6 = svmla_m( svptrue_b32(), z2, z6, zbeta );
+        z7 = svmla_m( svptrue_b32(), z3, z7, zbeta );
+
+        svst1( svptrue_b32(), &c_[result_tile_TL_corner + ( tcol + 3 ) * rs_c], z4 );
+        svst1( svptrue_b32(), &c_[result_tile_BL_corner + ( tcol + 3 ) * rs_c], z5 );
+        svst1( svptrue_b32(), &c_[result_tile_TR_corner + ( tcol + 3 ) * rs_c], z6 );
+        svst1( svptrue_b32(), &c_[result_tile_BR_corner + ( tcol + 3 ) * rs_c], z7 );
+      }
+    }
+  }
+  else // general cs_c: read horizontal ZA slices and store 4 vectors per slice with multi-vector stores
+  {
+    if ( beta_ == 0 )
+    {
+      for ( uint64_t tcol = 0; tcol < SVL; tcol += 4 )
+      {
+        // Read ZA slices into Z regs
+        svfloat32_t z0 = svread_hor_za32_m( z0, svptrue_b32(), /* tile: */ 0, /* slice: */ tcol + 0 );
+        svfloat32_t z1 = svread_hor_za32_m( z1, svptrue_b32(), /* tile: */ 1, /* slice: */ tcol + 0 );
+        svfloat32_t z2 = svread_hor_za32_m( z2, svptrue_b32(), /* tile: */ 2, /* slice: */ tcol + 0 );
+        svfloat32_t z3 = svread_hor_za32_m( z3, svptrue_b32(), /* tile: */ 3, /* slice: */ tcol + 0 );
+
+        // Scale Z regs by broadcast alpha
+        z0 = svmul_f32_z( svptrue_b32(), z0, zalpha );
+        z1 = svmul_f32_z( svptrue_b32(), z1, zalpha );
+        z2 = svmul_f32_z( svptrue_b32(), z2, zalpha );
+        z3 = svmul_f32_z( svptrue_b32(), z3, zalpha );
+
+        // Store full result into C
+        svfloat32x4_t z4w = svcreate4( z0, z1, z2, z3 );
+        svst1_f32_x4( svptrue_c32(), &c_[result_tile_TL_corner + ( tcol + 0 ) * cs_c], z4w );
+
+        z0 = svread_hor_za32_m( z0, svptrue_b32(), /* tile: */ 0, /* slice: */ tcol + 1 );
+        z1 = svread_hor_za32_m( z1, svptrue_b32(), /* tile: */ 1, /* slice: */ tcol + 1 );
+        z2 = svread_hor_za32_m( z2, svptrue_b32(), /* tile: */ 2, /* slice: */ tcol + 1 );
+        z3 = svread_hor_za32_m( z3, svptrue_b32(), /* tile: */ 3, /* slice: */ tcol + 1 );
+
+        // Repeat unfolded x4
+        svfloat32_t z4 = svmul_f32_z( svptrue_b32(), z0, zalpha );
+        svfloat32_t z5 = svmul_f32_z( svptrue_b32(), z1, zalpha );
+        svfloat32_t z6 = svmul_f32_z( svptrue_b32(), z2, zalpha );
+        svfloat32_t z7 = svmul_f32_z( svptrue_b32(), z3, zalpha );
+
+        svfloat32x4_t z5w = svcreate4( z4, z5, z6, z7 );
+        svst1_f32_x4( svptrue_c32(), &c_[result_tile_TL_corner + ( tcol + 1 ) * cs_c], z5w );
+
+        z0 = svread_hor_za32_m( z0, svptrue_b32(), /* tile: */ 0, /* slice: */ tcol + 2 );
+        z1 = svread_hor_za32_m( z1, svptrue_b32(), /* tile: */ 1, /* slice: */ tcol + 2 );
+        z2 = svread_hor_za32_m( z2, svptrue_b32(), /* tile: */ 2, /* slice: */ tcol + 2 );
+        z3 = svread_hor_za32_m( z3, svptrue_b32(), /* tile: */ 3, /* slice: */ tcol + 2 );
+
+        z0 = svmul_f32_z( svptrue_b32(), z0, zalpha );
+        z1 = svmul_f32_z( svptrue_b32(), z1, zalpha );
+        z2 = svmul_f32_z( svptrue_b32(), z2, zalpha );
+        z3 = svmul_f32_z( svptrue_b32(), z3, zalpha );
+
+        z4w = svcreate4( z0, z1, z2, z3 );
+        svst1_f32_x4( svptrue_c32(), &c_[result_tile_TL_corner + ( tcol + 2 ) * cs_c], z4w );
+
+        z0 = svread_hor_za32_m( z0, svptrue_b32(), /* tile: */ 0, /* slice: */ tcol + 3 );
+        z1 = svread_hor_za32_m( z1, svptrue_b32(), /* tile: */ 1, /* slice: */ tcol + 3 );
+        z2 = svread_hor_za32_m( z2, svptrue_b32(), /* tile: */ 2, /* slice: */ tcol + 3 );
+        z3 = svread_hor_za32_m( z3, svptrue_b32(), /* tile: */ 3, /* slice: */ tcol + 3 );
+
+        z4 = svmul_f32_z( svptrue_b32(), z0, zalpha );
+        z5 = svmul_f32_z( svptrue_b32(), z1, zalpha );
+        z6 = svmul_f32_z( svptrue_b32(), z2, zalpha );
+        z7 = svmul_f32_z( svptrue_b32(), z3, zalpha );
+
+        z5w = svcreate4( z4, z5, z6, z7 );
+        svst1_f32_x4( svptrue_c32(), &c_[result_tile_TL_corner + ( tcol + 3 ) * cs_c], z5w );
+      }
+    }
+    else
+    {
+      for ( uint64_t tcol = 0; tcol < SVL; tcol += 4 )
+      {
+        // Read ZA slices into Z regs
+        svfloat32_t z0 = svread_hor_za32_m( z0, svptrue_b32(), /* tile: */ 0, /* slice: */ tcol + 0 );
+        svfloat32_t z1 = svread_hor_za32_m( z1, svptrue_b32(), /* tile: */ 1, /* slice: */ tcol + 0 );
+        svfloat32_t z2 = svread_hor_za32_m( z2, svptrue_b32(), /* tile: */ 2, /* slice: */ tcol + 0 );
+        svfloat32_t z3 = svread_hor_za32_m( z3, svptrue_b32(), /* tile: */ 3, /* slice: */ tcol + 0 );
+
+        // Scale Z regs by broadcast alpha
+        z0 = svmul_f32_z( svptrue_b32(), z0, zalpha );
+        z1 = svmul_f32_z( svptrue_b32(), z1, zalpha );
+        z2 = svmul_f32_z( svptrue_b32(), z2, zalpha );
+        z3 = svmul_f32_z( svptrue_b32(), z3, zalpha );
+
+        // Load C into Z regs
+        svfloat32x4_t z4q = svld1_f32_x4( svptrue_c32(), &c_[result_tile_TL_corner + ( tcol + 0 ) * cs_c] );
+
+        // Scale Z regs by broadcast beta
+        svfloat32_t z4 = svmla_m( svptrue_b32(), z0, svget4( z4q, 0 ), zbeta );
+        svfloat32_t z5 = svmla_m( svptrue_b32(), z1, svget4( z4q, 1 ), zbeta );
+        svfloat32_t z6 = svmla_m( svptrue_b32(), z2, svget4( z4q, 2 ), zbeta );
+        svfloat32_t z7 = svmla_m( svptrue_b32(), z3, svget4( z4q, 3 ), zbeta );
+
+        // Store full result into C
+        svfloat32x4_t z4w = svcreate4( z4, z5, z6, z7 );
+        svst1_f32_x4( svptrue_c32(), &c_[result_tile_TL_corner + ( tcol + 0 ) * cs_c], z4w );
+
+        // Repeat unfolded x4
+        z0 = svread_hor_za32_m( z0, svptrue_b32(), /* tile: */ 0, /* slice: */ tcol + 1 );
+        z1 = svread_hor_za32_m( z1, svptrue_b32(), /* tile: */ 1, /* slice: */ tcol + 1 );
+        z2 = svread_hor_za32_m( z2, svptrue_b32(), /* tile: */ 2, /* slice: */ tcol + 1 );
+        z3 = svread_hor_za32_m( z3, svptrue_b32(), /* tile: */ 3, /* slice: */ tcol + 1 );
+
+        z0 = svmul_f32_z( svptrue_b32(), z0, zalpha );
+        z1 = svmul_f32_z( svptrue_b32(), z1, zalpha );
+        z2 = svmul_f32_z( svptrue_b32(), z2, zalpha );
+        z3 = svmul_f32_z( svptrue_b32(), z3, zalpha );
+
+        svfloat32x4_t z5q = svld1_f32_x4( svptrue_c32(), &c_[result_tile_TL_corner + ( tcol + 1 ) * cs_c] );
+
+        svfloat32_t z8 = svmla_m( svptrue_b32(), z0, svget4( z5q, 0 ), zbeta );
+        svfloat32_t z9 = svmla_m( svptrue_b32(), z1, svget4( z5q, 1 ), zbeta );
+        svfloat32_t z10 = svmla_m( svptrue_b32(), z2, svget4( z5q, 2 ), zbeta );
+        svfloat32_t z11 = svmla_m( svptrue_b32(), z3, svget4( z5q, 3 ), zbeta );
+
+        svfloat32x4_t z5w = svcreate4( z8, z9, z10, z11 );
+        svst1_f32_x4( svptrue_c32(), &c_[result_tile_TL_corner + ( tcol + 1 ) * cs_c], z5w );
+
+        z0 = svread_hor_za32_m( z0, svptrue_b32(), /* tile: */ 0, /* slice: */ tcol + 2 );
+        z1 = svread_hor_za32_m( z1, svptrue_b32(), /* tile: */ 1, /* slice: */ tcol + 2 );
+        z2 = svread_hor_za32_m( z2, svptrue_b32(), /* tile: */ 2, /* slice: */ tcol + 2 );
+        z3 = svread_hor_za32_m( z3, svptrue_b32(), /* tile: */ 3, /* slice: */ tcol + 2 );
+
+        z0 = svmul_f32_z( svptrue_b32(), z0, zalpha );
+        z1 = svmul_f32_z( svptrue_b32(), z1, zalpha );
+        z2 = svmul_f32_z( svptrue_b32(), z2, zalpha );
+        z3 = svmul_f32_z( svptrue_b32(), z3, zalpha );
+
+        z4q = svld1_f32_x4( svptrue_c32(), &c_[result_tile_TL_corner + ( tcol + 2 ) * cs_c] );
+
+        z4 = svmla_m( svptrue_b32(), z0, svget4( z4q, 0 ), zbeta );
+        z5 = svmla_m( svptrue_b32(), z1, svget4( z4q, 1 ), zbeta );
+        z6 = svmla_m( svptrue_b32(), z2, svget4( z4q, 2 ), zbeta );
+        z7 = svmla_m( svptrue_b32(), z3, svget4( z4q, 3 ), zbeta );
+
+        z4w = svcreate4( z4, z5, z6, z7 );
+        svst1_f32_x4( svptrue_c32(), &c_[result_tile_TL_corner + ( tcol + 2 ) * cs_c], z4w );
+
+        z0 = svread_hor_za32_m( z0, svptrue_b32(), /* tile: */ 0, /* slice: */ tcol + 3 );
+        z1 = svread_hor_za32_m( z1, svptrue_b32(), /* tile: */ 1, /* slice: */ tcol + 3 );
+        z2 = svread_hor_za32_m( z2, svptrue_b32(), /* tile: */ 2, /* slice: */ tcol + 3 );
+        z3 = svread_hor_za32_m( z3, svptrue_b32(), /* tile: */ 3, /* slice: */ tcol + 3 );
+
+        z0 = svmul_f32_z( svptrue_b32(), z0, zalpha );
+        z1 = svmul_f32_z( svptrue_b32(), z1, zalpha );
+        z2 = svmul_f32_z( svptrue_b32(), z2, zalpha );
+        z3 = svmul_f32_z( svptrue_b32(), z3, zalpha );
+
+        z5q = svld1_f32_x4( svptrue_c32(), &c_[result_tile_TL_corner + ( tcol + 3 ) * cs_c] );
+
+        z8 = svmla_m( svptrue_b32(), z0, svget4( z5q, 0 ), zbeta );
+        z9 = svmla_m( svptrue_b32(), z1, svget4( z5q, 1 ), zbeta );
+        z10 = svmla_m( svptrue_b32(), z2, svget4( z5q, 2 ), zbeta );
+        z11 = svmla_m( svptrue_b32(), z3, svget4( z5q, 3 ), zbeta );
+
+        z5w = svcreate4( z8, z9, z10, z11 );
+        svst1_f32_x4( svptrue_c32(), &c_[result_tile_TL_corner + ( tcol + 3 ) * cs_c], z5w );
+      }
+    }
+  }
+
+  GEMM_UKR_FLUSH_CT( s );
+
+  return;
+}
+__arm_new( "za" ) __arm_locally_streaming void bli_dgemm_armsme_int_8SVLxSVL
+  (
+    dim_t m,
+    dim_t n,
+    dim_t k,
+    const void* alpha,
+    const void* a,
+    const void* b,
+ const void* beta, + void* c, inc_t rs_c, inc_t cs_c, + const auxinfo_t* data, + const cntx_t* cntx + ) +{ + uint64_t SVL = svcntsd(); + + GEMM_UKR_SETUP_CT_AMBI( d, 8 * SVL, SVL, false ); + + double* a_ = (double*)a; + double* b_ = (double*)b; + double* c_ = (double*)c; + + svzero_za(); + + uint64_t k_; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; + + for ( k_ = 0; k_ < k_iter; k_++ ) + { + // Loads + svfloat64x4_t zL00 = svld1_f64_x4( svptrue_c32(), + (float64_t*)( &a_[0] ) ); + svfloat64x4_t zL01 = svld1_f64_x4( svptrue_c32(), + (float64_t*)( &a_[4 * SVL] ) ); + svfloat64_t zR00 = svld1_f64( svptrue_b32(), (float64_t*)( &b_[0] ) ); + + svmopa_za64_m( 0, svptrue_b32(), svptrue_b32(), svget4( zL00, 0 ), + zR00 ); + svmopa_za64_m( 1, svptrue_b32(), svptrue_b32(), svget4( zL00, 1 ), + zR00 ); + + svmopa_za64_m( 2, svptrue_b32(), svptrue_b32(), svget4( zL00, 2 ), + zR00 ); + svmopa_za64_m( 3, svptrue_b32(), svptrue_b32(), svget4( zL00, 3 ), + zR00 ); + + svmopa_za64_m( 4, svptrue_b32(), svptrue_b32(), svget4( zL01, 0 ), + zR00 ); + svmopa_za64_m( 5, svptrue_b32(), svptrue_b32(), svget4( zL01, 1 ), + zR00 ); + + svmopa_za64_m( 6, svptrue_b32(), svptrue_b32(), svget4( zL01, 2 ), + zR00 ); + svmopa_za64_m( 7, svptrue_b32(), svptrue_b32(), svget4( zL01, 3 ), + zR00 ); + + svfloat64x4_t zL02 = svld1_f64_x4( svptrue_c32(), + (float64_t*)( &a_[8 * SVL] ) ); + svfloat64x4_t zL03 = svld1_f64_x4( svptrue_c32(), + (float64_t*)( &a_[12 * SVL] ) ); + svfloat64_t zR01 = svld1_f64( svptrue_b32(), + (float64_t*)( &b_[1 * SVL] ) ); + + svmopa_za64_m( 0, svptrue_b32(), svptrue_b32(), svget4( zL02, 0 ), + zR01 ); + svmopa_za64_m( 1, svptrue_b32(), svptrue_b32(), svget4( zL02, 1 ), + zR01 ); + + svmopa_za64_m( 2, svptrue_b32(), svptrue_b32(), svget4( zL02, 2 ), + zR01 ); + svmopa_za64_m( 3, svptrue_b32(), svptrue_b32(), svget4( zL02, 3 ), + zR01 ); + + svmopa_za64_m( 4, svptrue_b32(), svptrue_b32(), svget4( zL03, 0 ), + zR01 ); + svmopa_za64_m( 5, svptrue_b32(), svptrue_b32(), 
svget4( zL03, 1 ), + zR01 ); + + svmopa_za64_m( 6, svptrue_b32(), svptrue_b32(), svget4( zL03, 2 ), + zR01 ); + svmopa_za64_m( 7, svptrue_b32(), svptrue_b32(), svget4( zL03, 3 ), + zR01 ); + + svfloat64x4_t zL04 = svld1_f64_x4( svptrue_c32(), + (float64_t*)( &a_[16 * SVL] ) ); + svfloat64x4_t zL05 = svld1_f64_x4( svptrue_c32(), + (float64_t*)( &a_[20 * SVL] ) ); + svfloat64_t zR02 = svld1_f64( svptrue_b32(), + (float64_t*)( &b_[2 * SVL] ) ); + + svmopa_za64_m( 0, svptrue_b32(), svptrue_b32(), svget4( zL04, 0 ), + zR02 ); + svmopa_za64_m( 1, svptrue_b32(), svptrue_b32(), svget4( zL04, 1 ), + zR02 ); + + svmopa_za64_m( 2, svptrue_b32(), svptrue_b32(), svget4( zL04, 2 ), + zR02 ); + svmopa_za64_m( 3, svptrue_b32(), svptrue_b32(), svget4( zL04, 3 ), + zR02 ); + + svmopa_za64_m( 4, svptrue_b32(), svptrue_b32(), svget4( zL05, 0 ), + zR02 ); + svmopa_za64_m( 5, svptrue_b32(), svptrue_b32(), svget4( zL05, 1 ), + zR02 ); + + svmopa_za64_m( 6, svptrue_b32(), svptrue_b32(), svget4( zL05, 2 ), + zR02 ); + svmopa_za64_m( 7, svptrue_b32(), svptrue_b32(), svget4( zL05, 3 ), + zR02 ); + + svfloat64x4_t zL06 = svld1_f64_x4( svptrue_c32(), + (float64_t*)( &a_[24 * SVL] ) ); + svfloat64x4_t zL07 = svld1_f64_x4( svptrue_c32(), + (float64_t*)( &a_[28 * SVL] ) ); + svfloat64_t zR03 = svld1_f64( svptrue_b32(), + (float64_t*)( &b_[3 * SVL] ) ); + + svmopa_za64_m( 0, svptrue_b32(), svptrue_b32(), svget4( zL06, 0 ), + zR03 ); + svmopa_za64_m( 1, svptrue_b32(), svptrue_b32(), svget4( zL06, 1 ), + zR03 ); + + svmopa_za64_m( 2, svptrue_b32(), svptrue_b32(), svget4( zL06, 2 ), + zR03 ); + svmopa_za64_m( 3, svptrue_b32(), svptrue_b32(), svget4( zL06, 3 ), + zR03 ); + + svmopa_za64_m( 4, svptrue_b32(), svptrue_b32(), svget4( zL07, 0 ), + zR03 ); + svmopa_za64_m( 5, svptrue_b32(), svptrue_b32(), svget4( zL07, 1 ), + zR03 ); + + svmopa_za64_m( 6, svptrue_b32(), svptrue_b32(), svget4( zL07, 2 ), + zR03 ); + svmopa_za64_m( 7, svptrue_b32(), svptrue_b32(), svget4( zL07, 3 ), + zR03 ); + + a_ += ( 4 * 
8 * SVL ); + b_ += ( 4 * SVL ); + } + + for ( k_ = 0; k_ < k_left; k_ += 1 ) + { + svfloat64x4_t zL00 = svld1_f64_x4( svptrue_c32(), + (float64_t*)( &a_[0] ) ); + svfloat64x4_t zL01 = svld1_f64_x4( svptrue_c32(), + (float64_t*)( &a_[4 * SVL] ) ); + svfloat64_t zR00 = svld1_f64( svptrue_b32(), (float64_t*)( &b_[0] ) ); + + svmopa_za64_m( 0, svptrue_b32(), svptrue_b32(), svget4( zL00, 0 ), + zR00 ); + svmopa_za64_m( 1, svptrue_b32(), svptrue_b32(), svget4( zL00, 1 ), + zR00 ); + + svmopa_za64_m( 2, svptrue_b32(), svptrue_b32(), svget4( zL00, 2 ), + zR00 ); + svmopa_za64_m( 3, svptrue_b32(), svptrue_b32(), svget4( zL00, 3 ), + zR00 ); + + svmopa_za64_m( 4, svptrue_b32(), svptrue_b32(), svget4( zL01, 0 ), + zR00 ); + svmopa_za64_m( 5, svptrue_b32(), svptrue_b32(), svget4( zL01, 1 ), + zR00 ); + + svmopa_za64_m( 6, svptrue_b32(), svptrue_b32(), svget4( zL01, 2 ), + zR00 ); + svmopa_za64_m( 7, svptrue_b32(), svptrue_b32(), svget4( zL01, 3 ), + zR00 ); + + a_ += ( 8 * SVL ); + b_ += ( SVL ); + } + + double beta_ = *(double*)beta; + double alpha_ = *(double*)alpha; + + const uint64_t result_tile_TL_corner = 0; + + svfloat64_t zbeta = svdup_f64( beta_ ); + svfloat64_t zalpha = svdup_f64( alpha_ ); + + if ( rs_c == 1 ) + { + const uint64_t result_tile_TR_corner = 4 * SVL; + + if ( beta_ == 0 ) + { + for ( uint64_t tcol = 0; tcol < SVL; tcol += 4 ) + { + // Read ZA slices into Z regs + svfloat64_t z0 = svread_ver_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 0 ); + svfloat64_t z1 = svread_ver_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 0 ); + svfloat64_t z2 = svread_ver_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 0 ); + svfloat64_t z3 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 0 ); + svfloat64_t z4 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 0 ); + svfloat64_t z5 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 0 ); + 
svfloat64_t z6 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 0 ); + svfloat64_t z7 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 0 ); + + // Scale Z regs by broadcast alpha + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + // Store full result into C + svfloat64x4_t z400 = svcreate4( z0, z1, z2, z3 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 0 ) * cs_c], z400 ); + + svfloat64x4_t z600 = svcreate4( z4, z5, z6, z7 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 0 ) * cs_c], z600 ); + + // tcol + 1 + z0 = svread_ver_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 1 ); + z1 = svread_ver_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 1 ); + z2 = svread_ver_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 1 ); + z3 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 1 ); + z4 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 1 ); + z5 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 1 ); + z6 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 1 ); + z7 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 1 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, 
zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + z400 = svcreate4( z0, z1, z2, z3 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 1 ) * cs_c], z400 ); + + z600 = svcreate4( z4, z5, z6, z7 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 1 ) * cs_c], z600 ); + + // tcol + 2 + z0 = svread_ver_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 2 ); + z1 = svread_ver_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 2 ); + z2 = svread_ver_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 2 ); + z3 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 2 ); + z4 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 2 ); + z5 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 2 ); + z6 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 2 ); + z7 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 2 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + z400 = svcreate4( z0, z1, z2, z3 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 2 ) * cs_c], z400 ); + + z600 = svcreate4( z4, z5, z6, z7 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 2 ) * cs_c], z600 ); + + // tcol + 3 + z0 = svread_ver_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 3 ); + z1 = svread_ver_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 3 ); + z2 = svread_ver_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 3 ); + z3 = svread_ver_za64_m( z3, 
svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 3 ); + z4 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 3 ); + z5 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 3 ); + z6 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 3 ); + z7 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 3 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + z400 = svcreate4( z0, z1, z2, z3 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 3 ) * cs_c], z400 ); + + z600 = svcreate4( z4, z5, z6, z7 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 3 ) * cs_c], z600 ); + } + } + else + { + for ( uint64_t tcol = 0; tcol < SVL; tcol += 4 ) + { + // Read ZA slices into Z regs + svfloat64_t z0 = svread_ver_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 0 ); + svfloat64_t z1 = svread_ver_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 0 ); + svfloat64_t z2 = svread_ver_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 0 ); + svfloat64_t z3 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 0 ); + svfloat64_t z4 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 0 ); + svfloat64_t z5 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 0 ); + svfloat64_t z6 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 0 ); + svfloat64_t z7 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 0 ); + + // Scale Z regs by broadcast alpha + 
z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + // Load C into Z regs + svfloat64x4_t zq5 = svld1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 0 ) * cs_c ) )] ); + svfloat64x4_t zq6 = svld1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( ( ( tcol + 0 ) * cs_c ) )] ); + + // Scale Z regs by broadcast beta + svfloat64_t z40 = svmla_m( svptrue_b32(), z0, svget4( zq5, 0 ), + zbeta ); + svfloat64_t z50 = svmla_m( svptrue_b32(), z1, svget4( zq5, 1 ), + zbeta ); + svfloat64_t z60 = svmla_m( svptrue_b32(), z2, svget4( zq5, 2 ), + zbeta ); + svfloat64_t z70 = svmla_m( svptrue_b32(), z3, svget4( zq5, 3 ), + zbeta ); + svfloat64_t z80 = svmla_m( svptrue_b32(), z4, svget4( zq6, 0 ), + zbeta ); + svfloat64_t z90 = svmla_m( svptrue_b32(), z5, svget4( zq6, 1 ), + zbeta ); + svfloat64_t za0 = svmla_m( svptrue_b32(), z6, svget4( zq6, 2 ), + zbeta ); + svfloat64_t zb0 = svmla_m( svptrue_b32(), z7, svget4( zq6, 3 ), + zbeta ); + + // Store full result into C + svfloat64x4_t z400 = svcreate4( z40, z50, z60, z70 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 0 ) * cs_c], z400 ); + + svfloat64x4_t z600 = svcreate4( z80, z90, za0, zb0 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 0 ) * cs_c], z600 ); + + // tcol + 1 + z0 = svread_ver_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 1 ); + z1 = svread_ver_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 1 ); + z2 = svread_ver_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 1 ); + z3 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 1 ); + z4 = 
svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 1 ); + z5 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 1 ); + z6 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 1 ); + z7 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 1 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + zq5 = svld1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 1 ) * cs_c ) )] ); + zq6 = svld1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( ( ( tcol + 1 ) * cs_c ) )] ); + + z40 = svmla_m( svptrue_b32(), z0, svget4( zq5, 0 ), zbeta ); + z50 = svmla_m( svptrue_b32(), z1, svget4( zq5, 1 ), zbeta ); + z60 = svmla_m( svptrue_b32(), z2, svget4( zq5, 2 ), zbeta ); + z70 = svmla_m( svptrue_b32(), z3, svget4( zq5, 3 ), zbeta ); + z80 = svmla_m( svptrue_b32(), z4, svget4( zq6, 0 ), zbeta ); + z90 = svmla_m( svptrue_b32(), z5, svget4( zq6, 1 ), zbeta ); + za0 = svmla_m( svptrue_b32(), z6, svget4( zq6, 2 ), zbeta ); + zb0 = svmla_m( svptrue_b32(), z7, svget4( zq6, 3 ), zbeta ); + + z400 = svcreate4( z40, z50, z60, z70 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 1 ) * cs_c], z400 ); + + z600 = svcreate4( z80, z90, za0, zb0 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 1 ) * cs_c], z600 ); + + // tcol + 2 + z0 = svread_ver_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 2 ); + z1 = svread_ver_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 2 ); + z2 = svread_ver_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 2 ); + z3 = 
svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 2 ); + z4 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 2 ); + z5 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 2 ); + z6 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 2 ); + z7 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 2 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + zq5 = svld1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 2 ) * cs_c ) )] ); + zq6 = svld1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( ( ( tcol + 2 ) * cs_c ) )] ); + + z40 = svmla_m( svptrue_b32(), z0, svget4( zq5, 0 ), zbeta ); + z50 = svmla_m( svptrue_b32(), z1, svget4( zq5, 1 ), zbeta ); + z60 = svmla_m( svptrue_b32(), z2, svget4( zq5, 2 ), zbeta ); + z70 = svmla_m( svptrue_b32(), z3, svget4( zq5, 3 ), zbeta ); + z80 = svmla_m( svptrue_b32(), z4, svget4( zq6, 0 ), zbeta ); + z90 = svmla_m( svptrue_b32(), z5, svget4( zq6, 1 ), zbeta ); + za0 = svmla_m( svptrue_b32(), z6, svget4( zq6, 2 ), zbeta ); + zb0 = svmla_m( svptrue_b32(), z7, svget4( zq6, 3 ), zbeta ); + + z400 = svcreate4( z40, z50, z60, z70 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 2 ) * cs_c], z400 ); + + z600 = svcreate4( z80, z90, za0, zb0 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 2 ) * cs_c], z600 ); + + // tcol + 3 + z0 = svread_ver_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 3 ); + z1 = svread_ver_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 3 ); + z2 = 
svread_ver_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 3 ); + z3 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 3 ); + z4 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 3 ); + z5 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 3 ); + z6 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 3 ); + z7 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 3 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + zq5 = svld1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 3 ) * cs_c ) )] ); + zq6 = svld1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( ( ( tcol + 3 ) * cs_c ) )] ); + + z40 = svmla_m( svptrue_b32(), z0, svget4( zq5, 0 ), zbeta ); + z50 = svmla_m( svptrue_b32(), z1, svget4( zq5, 1 ), zbeta ); + z60 = svmla_m( svptrue_b32(), z2, svget4( zq5, 2 ), zbeta ); + z70 = svmla_m( svptrue_b32(), z3, svget4( zq5, 3 ), zbeta ); + z80 = svmla_m( svptrue_b32(), z4, svget4( zq6, 0 ), zbeta ); + z90 = svmla_m( svptrue_b32(), z5, svget4( zq6, 1 ), zbeta ); + za0 = svmla_m( svptrue_b32(), z6, svget4( zq6, 2 ), zbeta ); + zb0 = svmla_m( svptrue_b32(), z7, svget4( zq6, 3 ), zbeta ); + + z400 = svcreate4( z40, z50, z60, z70 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 3 ) * cs_c], z400 ); + + z600 = svcreate4( z80, z90, za0, zb0 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 3 ) * cs_c], z600 ); + } + } + } + else + { + const uint64_t result_tile_1 = SVL * rs_c; + const uint64_t result_tile_2 = SVL * 2 * 
rs_c; + const uint64_t result_tile_3 = SVL * 3 * rs_c; + const uint64_t result_tile_4 = SVL * 4 * rs_c; + const uint64_t result_tile_5 = SVL * 5 * rs_c; + const uint64_t result_tile_6 = SVL * 6 * rs_c; + const uint64_t result_tile_7 = SVL * 7 * rs_c; + + if ( beta_ == 0 ) + { + for ( uint64_t tcol = 0; tcol < SVL; tcol += 4 ) + { + // Read ZA slices into Z regs + svfloat64_t z0 = svread_hor_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 0 ); + svfloat64_t z1 = svread_hor_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 0 ); + svfloat64_t z2 = svread_hor_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 0 ); + svfloat64_t z3 = svread_hor_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 0 ); + svfloat64_t z4 = svread_hor_za64_m( z4, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 0 ); + svfloat64_t z5 = svread_hor_za64_m( z5, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 0 ); + svfloat64_t z6 = svread_hor_za64_m( z6, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 0 ); + svfloat64_t z7 = svread_hor_za64_m( z7, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 0 ); + + // Scale Z regs by broadcast alpha + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + // Store full result into C + svst1_f64( svptrue_b32(), + &c_[result_tile_TL_corner + ( tcol + 0 ) * rs_c], z0 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_1 + ( tcol + 0 ) * rs_c], z1 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_2 + ( tcol + 0 ) * rs_c], z2 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_3 + ( tcol + 0 ) * rs_c], z3 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_4 + ( tcol + 0 
) * rs_c], z4 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_5 + ( tcol + 0 ) * rs_c], z5 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_6 + ( tcol + 0 ) * rs_c], z6 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_7 + ( tcol + 0 ) * rs_c], z7 ); + + // tcol + 1 + z0 = svread_hor_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 1 ); + z1 = svread_hor_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 1 ); + z2 = svread_hor_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 1 ); + z3 = svread_hor_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 1 ); + z4 = svread_hor_za64_m( z4, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 1 ); + z5 = svread_hor_za64_m( z5, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 1 ); + z6 = svread_hor_za64_m( z6, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 1 ); + z7 = svread_hor_za64_m( z7, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 1 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + svst1_f64( svptrue_b32(), + &c_[result_tile_TL_corner + ( tcol + 1 ) * rs_c], z0 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_1 + ( tcol + 1 ) * rs_c], z1 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_2 + ( tcol + 1 ) * rs_c], z2 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_3 + ( tcol + 1 ) * rs_c], z3 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_4 + ( tcol + 1 ) * rs_c], z4 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_5 + ( tcol + 1 ) * rs_c], z5 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_6 + ( tcol + 1 ) * rs_c], z6 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_7 + ( tcol + 1 ) * rs_c], z7 ); + + // 
tcol + 2 + z0 = svread_hor_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 2 ); + z1 = svread_hor_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 2 ); + z2 = svread_hor_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 2 ); + z3 = svread_hor_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 2 ); + z4 = svread_hor_za64_m( z4, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 2 ); + z5 = svread_hor_za64_m( z5, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 2 ); + z6 = svread_hor_za64_m( z6, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 2 ); + z7 = svread_hor_za64_m( z7, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 2 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + svst1_f64( svptrue_b32(), + &c_[result_tile_TL_corner + ( tcol + 2 ) * rs_c], z0 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_1 + ( tcol + 2 ) * rs_c], z1 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_2 + ( tcol + 2 ) * rs_c], z2 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_3 + ( tcol + 2 ) * rs_c], z3 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_4 + ( tcol + 2 ) * rs_c], z4 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_5 + ( tcol + 2 ) * rs_c], z5 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_6 + ( tcol + 2 ) * rs_c], z6 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_7 + ( tcol + 2 ) * rs_c], z7 ); + + // tcol + 3 + z0 = svread_hor_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 3 ); + z1 = svread_hor_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 3 ); + z2 = svread_hor_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ 
tcol + 3 ); + z3 = svread_hor_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 3 ); + z4 = svread_hor_za64_m( z4, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 3 ); + z5 = svread_hor_za64_m( z5, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 3 ); + z6 = svread_hor_za64_m( z6, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 3 ); + z7 = svread_hor_za64_m( z7, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 3 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + svst1_f64( svptrue_b32(), + &c_[result_tile_TL_corner + ( tcol + 3 ) * rs_c], z0 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_1 + ( tcol + 3 ) * rs_c], z1 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_2 + ( tcol + 3 ) * rs_c], z2 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_3 + ( tcol + 3 ) * rs_c], z3 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_4 + ( tcol + 3 ) * rs_c], z4 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_5 + ( tcol + 3 ) * rs_c], z5 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_6 + ( tcol + 3 ) * rs_c], z6 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_7 + ( tcol + 3 ) * rs_c], z7 ); + } + } + else + { + for ( uint64_t tcol = 0; tcol < SVL; tcol += 4 ) + { + // Read ZA slices into Z regs + svfloat64_t z0 = svread_hor_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 0 ); + svfloat64_t z1 = svread_hor_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 0 ); + svfloat64_t z2 = svread_hor_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 0 ); + svfloat64_t z3 = svread_hor_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 0 ); + svfloat64_t z4 = 
svread_hor_za64_m( z4, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 0 ); + svfloat64_t z5 = svread_hor_za64_m( z5, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 0 ); + svfloat64_t z6 = svread_hor_za64_m( z6, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 0 ); + svfloat64_t z7 = svread_hor_za64_m( z7, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 0 ); + + // Scale Z regs by broadcast alpha + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + // Load C into Z regs + svfloat64_t zq0 = svld1_f64( svptrue_b32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 0 ) * rs_c ) )] ); + svfloat64_t zq1 = svld1_f64( svptrue_b32(), + &c_[result_tile_1 + ( ( ( tcol + 0 ) * rs_c ) )] ); + svfloat64_t zq2 = svld1_f64( svptrue_b32(), + &c_[result_tile_2 + ( ( ( tcol + 0 ) * rs_c ) )] ); + svfloat64_t zq3 = svld1_f64( svptrue_b32(), + &c_[result_tile_3 + ( ( ( tcol + 0 ) * rs_c ) )] ); + svfloat64_t zq4 = svld1_f64( svptrue_b32(), + &c_[result_tile_4 + ( ( ( tcol + 0 ) * rs_c ) )] ); + svfloat64_t zq5 = svld1_f64( svptrue_b32(), + &c_[result_tile_5 + ( ( ( tcol + 0 ) * rs_c ) )] ); + svfloat64_t zq6 = svld1_f64( svptrue_b32(), + &c_[result_tile_6 + ( ( ( tcol + 0 ) * rs_c ) )] ); + svfloat64_t zq7 = svld1_f64( svptrue_b32(), + &c_[result_tile_7 + ( ( ( tcol + 0 ) * rs_c ) )] ); + + // Scale Z regs by broadcast beta + svfloat64_t z00 = svmla_m( svptrue_b32(), z0, zq0, zbeta ); + svfloat64_t z10 = svmla_m( svptrue_b32(), z1, zq1, zbeta ); + svfloat64_t z20 = svmla_m( svptrue_b32(), z2, zq2, zbeta ); + svfloat64_t z30 = svmla_m( svptrue_b32(), z3, zq3, zbeta ); + svfloat64_t z40 = svmla_m( svptrue_b32(), z4, zq4, zbeta ); + 
svfloat64_t z50 = svmla_m( svptrue_b32(), z5, zq5, zbeta ); + svfloat64_t z60 = svmla_m( svptrue_b32(), z6, zq6, zbeta ); + svfloat64_t z70 = svmla_m( svptrue_b32(), z7, zq7, zbeta ); + + // Store full result into C + svst1_f64( svptrue_b32(), + &c_[result_tile_TL_corner + ( tcol + 0 ) * rs_c], z00 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_1 + ( tcol + 0 ) * rs_c], z10 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_2 + ( tcol + 0 ) * rs_c], z20 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_3 + ( tcol + 0 ) * rs_c], z30 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_4 + ( tcol + 0 ) * rs_c], z40 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_5 + ( tcol + 0 ) * rs_c], z50 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_6 + ( tcol + 0 ) * rs_c], z60 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_7 + ( tcol + 0 ) * rs_c], z70 ); + + // tcol + 1 + z0 = svread_hor_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 1 ); + z1 = svread_hor_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 1 ); + z2 = svread_hor_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 1 ); + z3 = svread_hor_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 1 ); + z4 = svread_hor_za64_m( z4, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 1 ); + z5 = svread_hor_za64_m( z5, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 1 ); + z6 = svread_hor_za64_m( z6, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 1 ); + z7 = svread_hor_za64_m( z7, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 1 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + zq0 = svld1_f64( svptrue_b32(), + 
&c_[result_tile_TL_corner + ( ( ( tcol + 1 ) * rs_c ) )] ); + zq1 = svld1_f64( svptrue_b32(), + &c_[result_tile_1 + ( ( ( tcol + 1 ) * rs_c ) )] ); + zq2 = svld1_f64( svptrue_b32(), + &c_[result_tile_2 + ( ( ( tcol + 1 ) * rs_c ) )] ); + zq3 = svld1_f64( svptrue_b32(), + &c_[result_tile_3 + ( ( ( tcol + 1 ) * rs_c ) )] ); + zq4 = svld1_f64( svptrue_b32(), + &c_[result_tile_4 + ( ( ( tcol + 1 ) * rs_c ) )] ); + zq5 = svld1_f64( svptrue_b32(), + &c_[result_tile_5 + ( ( ( tcol + 1 ) * rs_c ) )] ); + zq6 = svld1_f64( svptrue_b32(), + &c_[result_tile_6 + ( ( ( tcol + 1 ) * rs_c ) )] ); + zq7 = svld1_f64( svptrue_b32(), + &c_[result_tile_7 + ( ( ( tcol + 1 ) * rs_c ) )] ); + + z00 = svmla_m( svptrue_b32(), z0, zq0, zbeta ); + z10 = svmla_m( svptrue_b32(), z1, zq1, zbeta ); + z20 = svmla_m( svptrue_b32(), z2, zq2, zbeta ); + z30 = svmla_m( svptrue_b32(), z3, zq3, zbeta ); + z40 = svmla_m( svptrue_b32(), z4, zq4, zbeta ); + z50 = svmla_m( svptrue_b32(), z5, zq5, zbeta ); + z60 = svmla_m( svptrue_b32(), z6, zq6, zbeta ); + z70 = svmla_m( svptrue_b32(), z7, zq7, zbeta ); + + svst1_f64( svptrue_b32(), + &c_[result_tile_TL_corner + ( tcol + 1 ) * rs_c], z00 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_1 + ( tcol + 1 ) * rs_c], z10 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_2 + ( tcol + 1 ) * rs_c], z20 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_3 + ( tcol + 1 ) * rs_c], z30 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_4 + ( tcol + 1 ) * rs_c], z40 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_5 + ( tcol + 1 ) * rs_c], z50 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_6 + ( tcol + 1 ) * rs_c], z60 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_7 + ( tcol + 1 ) * rs_c], z70 ); + + // tcol + 2 + z0 = svread_hor_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 2 ); + z1 = svread_hor_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 2 ); + z2 = svread_hor_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 2 ); + 
z3 = svread_hor_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 2 ); + z4 = svread_hor_za64_m( z4, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 2 ); + z5 = svread_hor_za64_m( z5, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 2 ); + z6 = svread_hor_za64_m( z6, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 2 ); + z7 = svread_hor_za64_m( z7, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 2 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + zq0 = svld1_f64( svptrue_b32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 2 ) * rs_c ) )] ); + zq1 = svld1_f64( svptrue_b32(), + &c_[result_tile_1 + ( ( ( tcol + 2 ) * rs_c ) )] ); + zq2 = svld1_f64( svptrue_b32(), + &c_[result_tile_2 + ( ( ( tcol + 2 ) * rs_c ) )] ); + zq3 = svld1_f64( svptrue_b32(), + &c_[result_tile_3 + ( ( ( tcol + 2 ) * rs_c ) )] ); + zq4 = svld1_f64( svptrue_b32(), + &c_[result_tile_4 + ( ( ( tcol + 2 ) * rs_c ) )] ); + zq5 = svld1_f64( svptrue_b32(), + &c_[result_tile_5 + ( ( ( tcol + 2 ) * rs_c ) )] ); + zq6 = svld1_f64( svptrue_b32(), + &c_[result_tile_6 + ( ( ( tcol + 2 ) * rs_c ) )] ); + zq7 = svld1_f64( svptrue_b32(), + &c_[result_tile_7 + ( ( ( tcol + 2 ) * rs_c ) )] ); + + z00 = svmla_m( svptrue_b32(), z0, zq0, zbeta ); + z10 = svmla_m( svptrue_b32(), z1, zq1, zbeta ); + z20 = svmla_m( svptrue_b32(), z2, zq2, zbeta ); + z30 = svmla_m( svptrue_b32(), z3, zq3, zbeta ); + z40 = svmla_m( svptrue_b32(), z4, zq4, zbeta ); + z50 = svmla_m( svptrue_b32(), z5, zq5, zbeta ); + z60 = svmla_m( svptrue_b32(), z6, zq6, zbeta ); + z70 = svmla_m( svptrue_b32(), z7, zq7, zbeta ); + + svst1_f64( svptrue_b32(), + 
&c_[result_tile_TL_corner + ( tcol + 2 ) * rs_c], z00 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_1 + ( tcol + 2 ) * rs_c], z10 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_2 + ( tcol + 2 ) * rs_c], z20 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_3 + ( tcol + 2 ) * rs_c], z30 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_4 + ( tcol + 2 ) * rs_c], z40 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_5 + ( tcol + 2 ) * rs_c], z50 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_6 + ( tcol + 2 ) * rs_c], z60 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_7 + ( tcol + 2 ) * rs_c], z70 ); + + // tcol + 3 + z0 = svread_hor_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 3 ); + z1 = svread_hor_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 3 ); + z2 = svread_hor_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 3 ); + z3 = svread_hor_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 3 ); + z4 = svread_hor_za64_m( z4, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 3 ); + z5 = svread_hor_za64_m( z5, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 3 ); + z6 = svread_hor_za64_m( z6, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 3 ); + z7 = svread_hor_za64_m( z7, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 3 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + zq0 = svld1_f64( svptrue_b32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 3 ) * rs_c ) )] ); + zq1 = svld1_f64( svptrue_b32(), + &c_[result_tile_1 + ( ( ( tcol + 3 ) * rs_c ) )] ); + zq2 = svld1_f64( svptrue_b32(), + &c_[result_tile_2 + ( ( ( tcol + 3 ) * rs_c ) )] ); + zq3 = 
svld1_f64( svptrue_b32(), + &c_[result_tile_3 + ( ( ( tcol + 3 ) * rs_c ) )] ); + zq4 = svld1_f64( svptrue_b32(), + &c_[result_tile_4 + ( ( ( tcol + 3 ) * rs_c ) )] ); + zq5 = svld1_f64( svptrue_b32(), + &c_[result_tile_5 + ( ( ( tcol + 3 ) * rs_c ) )] ); + zq6 = svld1_f64( svptrue_b32(), + &c_[result_tile_6 + ( ( ( tcol + 3 ) * rs_c ) )] ); + zq7 = svld1_f64( svptrue_b32(), + &c_[result_tile_7 + ( ( ( tcol + 3 ) * rs_c ) )] ); + + z00 = svmla_m( svptrue_b32(), z0, zq0, zbeta ); + z10 = svmla_m( svptrue_b32(), z1, zq1, zbeta ); + z20 = svmla_m( svptrue_b32(), z2, zq2, zbeta ); + z30 = svmla_m( svptrue_b32(), z3, zq3, zbeta ); + z40 = svmla_m( svptrue_b32(), z4, zq4, zbeta ); + z50 = svmla_m( svptrue_b32(), z5, zq5, zbeta ); + z60 = svmla_m( svptrue_b32(), z6, zq6, zbeta ); + z70 = svmla_m( svptrue_b32(), z7, zq7, zbeta ); + + svst1_f64( svptrue_b32(), + &c_[result_tile_TL_corner + ( tcol + 3 ) * rs_c], z00 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_1 + ( tcol + 3 ) * rs_c], z10 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_2 + ( tcol + 3 ) * rs_c], z20 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_3 + ( tcol + 3 ) * rs_c], z30 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_4 + ( tcol + 3 ) * rs_c], z40 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_5 + ( tcol + 3 ) * rs_c], z50 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_6 + ( tcol + 3 ) * rs_c], z60 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_7 + ( tcol + 3 ) * rs_c], z70 ); + } + } + } + + GEMM_UKR_FLUSH_CT( d ); + + return; +} + diff --git a/kernels/armsme/3/bli_gemm_armsme_int_sSVLx4SVL.c b/kernels/armsme/3/bli_gemm_armsme_int_sSVLx4SVL.c new file mode 100644 index 000000000..eefd19bcf --- /dev/null +++ b/kernels/armsme/3/bli_gemm_armsme_int_sSVLx4SVL.c @@ -0,0 +1,1674 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2021, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include +#include "blis.h" + +__arm_new( "za" ) __arm_locally_streaming void bli_sgemm_armsme_int_SVLx4SVL + ( + dim_t m, + dim_t n, + dim_t k, + const void* alpha, + const void* a, + const void* b, + const void* beta, + void* c, inc_t rs_c, inc_t cs_c, + const auxinfo_t* data, + const cntx_t* cntx + ) +{ + uint64_t SVL = svcntsw(); + + GEMM_UKR_SETUP_CT_AMBI( s, SVL, 4 * SVL, false ); + + float* a_ = (float*)a; + float* b_ = (float*)b; + + const void* a_next = bli_auxinfo_next_a( data ); + const void* b_next = bli_auxinfo_next_b( data ); + + float* c_ = (float*)c; + + svzero_za(); + + uint64_t k_; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; + + for ( k_ = 0; k_ < k_iter; k_++ ) + { + // Loads + svfloat32x4_t zL00 = svld1_f32_x4( svptrue_c32(), + (float32_t*)( &a_[0] ) ); + + svfloat32x4_t zR00 = svld1_f32_x4( svptrue_c32(), + (float32_t*)( &b_[0] ) ); + + svmopa_za32_m( 0, svptrue_b32(), svptrue_b32(), svget4( zL00, 0 ), + svget4( zR00, 0 ) ); + svmopa_za32_m( 1, svptrue_b32(), svptrue_b32(), svget4( zL00, 0 ), + svget4( zR00, 1 ) ); + + svfloat32x4_t zR01 = svld1_f32_x4( svptrue_c32(), + (float32_t*)( &b_[( 4 * SVL )] ) ); + + svmopa_za32_m( 2, svptrue_b32(), svptrue_b32(), svget4( zL00, 0 ), + svget4( zR00, 2 ) ); + svmopa_za32_m( 3, svptrue_b32(), svptrue_b32(), svget4( zL00, 0 ), + svget4( zR00, 3 ) ); + + svmopa_za32_m( 0, svptrue_b32(), svptrue_b32(), svget4( zL00, 1 ), + svget4( zR01, 0 ) ); + svmopa_za32_m( 1, svptrue_b32(), svptrue_b32(), svget4( zL00, 1 ), + svget4( zR01, 1 ) ); + + svfloat32x4_t zR02 = svld1_f32_x4( svptrue_c32(), + (float32_t*)( &b_[2 * ( 4 * SVL )] ) ); + + svmopa_za32_m( 2, svptrue_b32(), svptrue_b32(), svget4( zL00, 1 ), + svget4( zR01, 2 ) ); + svmopa_za32_m( 3, svptrue_b32(), svptrue_b32(), svget4( zL00, 1 ), + svget4( zR01, 3 ) ); + + svmopa_za32_m( 0, svptrue_b32(), svptrue_b32(), svget4( zL00, 2 ), + svget4( zR02, 0 ) ); + svmopa_za32_m( 1, svptrue_b32(), svptrue_b32(), svget4( zL00, 2 ), + svget4( zR02, 1 ) 
); + + svfloat32x4_t zR03 = svld1_f32_x4( svptrue_c32(), + (float32_t*)( &b_[3 * ( 4 * SVL )] ) ); + + svmopa_za32_m( 2, svptrue_b32(), svptrue_b32(), svget4( zL00, 2 ), + svget4( zR02, 2 ) ); + svmopa_za32_m( 3, svptrue_b32(), svptrue_b32(), svget4( zL00, 2 ), + svget4( zR02, 3 ) ); + + svmopa_za32_m( 0, svptrue_b32(), svptrue_b32(), svget4( zL00, 3 ), + svget4( zR03, 0 ) ); + svprfb( svptrue_b32(), (float*)&a_next, 0 ); + svmopa_za32_m( 1, svptrue_b32(), svptrue_b32(), svget4( zL00, 3 ), + svget4( zR03, 1 ) ); + + svprfb( svptrue_b32(), (float*)&b_next, 0 ); + svmopa_za32_m( 2, svptrue_b32(), svptrue_b32(), svget4( zL00, 3 ), + svget4( zR03, 2 ) ); + svmopa_za32_m( 3, svptrue_b32(), svptrue_b32(), svget4( zL00, 3 ), + svget4( zR03, 3 ) ); + + a_ += ( 4 * SVL ); + b_ += ( 4 * 4 * SVL ); + } + + for ( k_ = 0; k_ < k_left; k_ += 1 ) + { + svfloat32_t zL00 = svld1_f32( svptrue_b32(), (float32_t*)( &a_[0] ) ); + svfloat32x4_t zR00 = svld1_f32_x4( svptrue_c32(), + (float32_t*)( &b_[0] ) ); + + svmopa_za32_m( 0, svptrue_b32(), svptrue_b32(), zL00, + svget4( zR00, 0 ) ); + svmopa_za32_m( 1, svptrue_b32(), svptrue_b32(), zL00, + svget4( zR00, 1 ) ); + + svmopa_za32_m( 2, svptrue_b32(), svptrue_b32(), zL00, + svget4( zR00, 2 ) ); + svmopa_za32_m( 3, svptrue_b32(), svptrue_b32(), zL00, + svget4( zR00, 3 ) ); + + a_ += ( SVL ); + b_ += ( 4 * SVL ); + } + + float beta_ = *(float*)beta; + float alpha_ = *(float*)alpha; + + const uint64_t result_tile_TL_corner = 0; + const uint64_t result_tile_BL_corner = SVL * cs_c; + const uint64_t result_tile_TR_corner = SVL * 2 * cs_c; + const uint64_t result_tile_BR_corner = SVL * 3 * cs_c; + + svfloat32_t zbeta = svdup_f32( beta_ ); + svfloat32_t zalpha = svdup_f32( alpha_ ); + + if ( rs_c == 1 ) + { + if ( beta_ == 0 ) + { + for ( uint64_t tcol = 0; tcol < SVL; tcol += 4 ) + { + // Read ZA slices into Z regs + svfloat32_t z0 = svread_ver_za32_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 0 ); + svfloat32_t z1 = 
svread_ver_za32_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 0 ); + svfloat32_t z2 = svread_ver_za32_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 0 ); + svfloat32_t z3 = svread_ver_za32_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 0 ); + + // Scale Z regs by broadcast alpha + z0 = svmul_f32_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f32_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f32_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f32_z( svptrue_b32(), z3, zalpha ); + + // Store full result into C + svst1( svptrue_b32(), + &c_[result_tile_TL_corner + ( tcol + 0 ) * cs_c], z0 ); + svst1( svptrue_b32(), + &c_[result_tile_BL_corner + ( tcol + 0 ) * cs_c], z1 ); + svst1( svptrue_b32(), + &c_[result_tile_TR_corner + ( tcol + 0 ) * cs_c], z2 ); + svst1( svptrue_b32(), + &c_[result_tile_BR_corner + ( tcol + 0 ) * cs_c], z3 ); + + // Repeat unfolded x4 + z0 = svread_ver_za32_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 1 ); + z1 = svread_ver_za32_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 1 ); + z2 = svread_ver_za32_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 1 ); + z3 = svread_ver_za32_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 1 ); + + z0 = svmul_f32_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f32_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f32_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f32_z( svptrue_b32(), z3, zalpha ); + + svst1( svptrue_b32(), + &c_[result_tile_TL_corner + ( tcol + 1 ) * cs_c], z0 ); + svst1( svptrue_b32(), + &c_[result_tile_BL_corner + ( tcol + 1 ) * cs_c], z1 ); + svst1( svptrue_b32(), + &c_[result_tile_TR_corner + ( tcol + 1 ) * cs_c], z2 ); + svst1( svptrue_b32(), + &c_[result_tile_BR_corner + ( tcol + 1 ) * cs_c], z3 ); + + z0 = svread_ver_za32_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 2 ); + z1 = svread_ver_za32_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 2 ); + z2 = svread_ver_za32_m( z2, svptrue_b32(), + 
/* tile: */ 2, /* slice: */ tcol + 2 ); + z3 = svread_ver_za32_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 2 ); + + z0 = svmul_f32_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f32_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f32_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f32_z( svptrue_b32(), z3, zalpha ); + + svst1( svptrue_b32(), + &c_[result_tile_TL_corner + ( tcol + 2 ) * cs_c], z0 ); + svst1( svptrue_b32(), + &c_[result_tile_BL_corner + ( tcol + 2 ) * cs_c], z1 ); + svst1( svptrue_b32(), + &c_[result_tile_TR_corner + ( tcol + 2 ) * cs_c], z2 ); + svst1( svptrue_b32(), + &c_[result_tile_BR_corner + ( tcol + 2 ) * cs_c], z3 ); + + z0 = svread_ver_za32_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 3 ); + z1 = svread_ver_za32_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 3 ); + z2 = svread_ver_za32_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 3 ); + z3 = svread_ver_za32_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 3 ); + + z0 = svmul_f32_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f32_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f32_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f32_z( svptrue_b32(), z3, zalpha ); + + svst1( svptrue_b32(), + &c_[result_tile_TL_corner + ( tcol + 3 ) * cs_c], z0 ); + svst1( svptrue_b32(), + &c_[result_tile_BL_corner + ( tcol + 3 ) * cs_c], z1 ); + svst1( svptrue_b32(), + &c_[result_tile_TR_corner + ( tcol + 3 ) * cs_c], z2 ); + svst1( svptrue_b32(), + &c_[result_tile_BR_corner + ( tcol + 3 ) * cs_c], z3 ); + } + } + else + { + for ( uint64_t tcol = 0; tcol < SVL; tcol += 4 ) + { + // Read ZA slices into Z regs + svfloat32_t z0 = svread_ver_za32_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 0 ); + svfloat32_t z1 = svread_ver_za32_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 0 ); + svfloat32_t z2 = svread_ver_za32_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 0 ); + svfloat32_t z3 = svread_ver_za32_m( z3, svptrue_b32(), + /* 
tile: */ 3, /* slice: */ tcol + 0 ); + + // Scale Z regs by broadcast alpha + z0 = svmul_f32_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f32_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f32_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f32_z( svptrue_b32(), z3, zalpha ); + + // Load C into Z regs + svfloat32_t z4 = svld1_f32( svptrue_b32(), + &c_[result_tile_TL_corner + ( tcol + 0 ) * cs_c] ); + svfloat32_t z5 = svld1_f32( svptrue_b32(), + &c_[result_tile_BL_corner + ( tcol + 0 ) * cs_c] ); + svfloat32_t z6 = svld1_f32( svptrue_b32(), + &c_[result_tile_TR_corner + ( tcol + 0 ) * cs_c] ); + svfloat32_t z7 = svld1_f32( svptrue_b32(), + &c_[result_tile_BR_corner + ( tcol + 0 ) * cs_c] ); + + // Scale Z regs by broadcast beta + z4 = svmla_m( svptrue_b32(), z0, z4, zbeta ); + z5 = svmla_m( svptrue_b32(), z1, z5, zbeta ); + z6 = svmla_m( svptrue_b32(), z2, z6, zbeta ); + z7 = svmla_m( svptrue_b32(), z3, z7, zbeta ); + + // Store full result into C + svst1( svptrue_b32(), + &c_[result_tile_TL_corner + ( tcol + 0 ) * cs_c], z4 ); + svst1( svptrue_b32(), + &c_[result_tile_BL_corner + ( tcol + 0 ) * cs_c], z5 ); + svst1( svptrue_b32(), + &c_[result_tile_TR_corner + ( tcol + 0 ) * cs_c], z6 ); + svst1( svptrue_b32(), + &c_[result_tile_BR_corner + ( tcol + 0 ) * cs_c], z7 ); + + // Repeat unfolded x4 + z0 = svread_ver_za32_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 1 ); + z1 = svread_ver_za32_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 1 ); + z2 = svread_ver_za32_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 1 ); + z3 = svread_ver_za32_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 1 ); + + z0 = svmul_f32_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f32_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f32_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f32_z( svptrue_b32(), z3, zalpha ); + + z4 = svld1_f32( svptrue_b32(), + &c_[result_tile_TL_corner + ( tcol + 1 ) * cs_c] ); + z5 = svld1_f32( svptrue_b32(), + 
&c_[result_tile_BL_corner + ( tcol + 1 ) * cs_c] ); + z6 = svld1_f32( svptrue_b32(), + &c_[result_tile_TR_corner + ( tcol + 1 ) * cs_c] ); + z7 = svld1_f32( svptrue_b32(), + &c_[result_tile_BR_corner + ( tcol + 1 ) * cs_c] ); + + z4 = svmla_m( svptrue_b32(), z0, z4, zbeta ); + z5 = svmla_m( svptrue_b32(), z1, z5, zbeta ); + z6 = svmla_m( svptrue_b32(), z2, z6, zbeta ); + z7 = svmla_m( svptrue_b32(), z3, z7, zbeta ); + + svst1( svptrue_b32(), + &c_[result_tile_TL_corner + ( tcol + 1 ) * cs_c], z4 ); + svst1( svptrue_b32(), + &c_[result_tile_BL_corner + ( tcol + 1 ) * cs_c], z5 ); + svst1( svptrue_b32(), + &c_[result_tile_TR_corner + ( tcol + 1 ) * cs_c], z6 ); + svst1( svptrue_b32(), + &c_[result_tile_BR_corner + ( tcol + 1 ) * cs_c], z7 ); + + z0 = svread_ver_za32_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 2 ); + z1 = svread_ver_za32_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 2 ); + z2 = svread_ver_za32_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 2 ); + z3 = svread_ver_za32_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 2 ); + + z0 = svmul_f32_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f32_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f32_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f32_z( svptrue_b32(), z3, zalpha ); + + z4 = svld1_f32( svptrue_b32(), + &c_[result_tile_TL_corner + ( tcol + 2 ) * cs_c] ); + z5 = svld1_f32( svptrue_b32(), + &c_[result_tile_BL_corner + ( tcol + 2 ) * cs_c] ); + z6 = svld1_f32( svptrue_b32(), + &c_[result_tile_TR_corner + ( tcol + 2 ) * cs_c] ); + z7 = svld1_f32( svptrue_b32(), + &c_[result_tile_BR_corner + ( tcol + 2 ) * cs_c] ); + + z4 = svmla_m( svptrue_b32(), z0, z4, zbeta ); + z5 = svmla_m( svptrue_b32(), z1, z5, zbeta ); + z6 = svmla_m( svptrue_b32(), z2, z6, zbeta ); + z7 = svmla_m( svptrue_b32(), z3, z7, zbeta ); + + svst1( svptrue_b32(), + &c_[result_tile_TL_corner + ( tcol + 2 ) * cs_c], z4 ); + svst1( svptrue_b32(), + &c_[result_tile_BL_corner + ( tcol + 2 ) * 
cs_c], z5 ); + svst1( svptrue_b32(), + &c_[result_tile_TR_corner + ( tcol + 2 ) * cs_c], z6 ); + svst1( svptrue_b32(), + &c_[result_tile_BR_corner + ( tcol + 2 ) * cs_c], z7 ); + + z0 = svread_ver_za32_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 3 ); + z1 = svread_ver_za32_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 3 ); + z2 = svread_ver_za32_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 3 ); + z3 = svread_ver_za32_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 3 ); + + z0 = svmul_f32_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f32_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f32_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f32_z( svptrue_b32(), z3, zalpha ); + + z4 = svld1_f32( svptrue_b32(), + &c_[result_tile_TL_corner + ( tcol + 3 ) * cs_c] ); + z5 = svld1_f32( svptrue_b32(), + &c_[result_tile_BL_corner + ( tcol + 3 ) * cs_c] ); + z6 = svld1_f32( svptrue_b32(), + &c_[result_tile_TR_corner + ( tcol + 3 ) * cs_c] ); + z7 = svld1_f32( svptrue_b32(), + &c_[result_tile_BR_corner + ( tcol + 3 ) * cs_c] ); + + z4 = svmla_m( svptrue_b32(), z0, z4, zbeta ); + z5 = svmla_m( svptrue_b32(), z1, z5, zbeta ); + z6 = svmla_m( svptrue_b32(), z2, z6, zbeta ); + z7 = svmla_m( svptrue_b32(), z3, z7, zbeta ); + + svst1( svptrue_b32(), + &c_[result_tile_TL_corner + ( tcol + 3 ) * cs_c], z4 ); + svst1( svptrue_b32(), + &c_[result_tile_BL_corner + ( tcol + 3 ) * cs_c], z5 ); + svst1( svptrue_b32(), + &c_[result_tile_TR_corner + ( tcol + 3 ) * cs_c], z6 ); + svst1( svptrue_b32(), + &c_[result_tile_BR_corner + ( tcol + 3 ) * cs_c], z7 ); + } + } + } + else + { + if ( beta_ == 0 ) + { + for ( uint64_t tcol = 0; tcol < SVL; tcol += 4 ) + { + // Read ZA slices into Z regs + svfloat32_t z0 = svread_hor_za32_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 0 ); + svfloat32_t z1 = svread_hor_za32_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 0 ); + svfloat32_t z2 = svread_hor_za32_m( z2, svptrue_b32(), + /* tile: 
*/ 2, /* slice: */ tcol + 0 ); + svfloat32_t z3 = svread_hor_za32_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 0 ); + + // Scale Z regs by broadcast alpha + z0 = svmul_f32_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f32_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f32_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f32_z( svptrue_b32(), z3, zalpha ); + + // Store full result into C + svfloat32x4_t z4w = svcreate4( z0, z1, z2, z3 ); + svst1_f32_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 0 ) * rs_c], z4w ); + + // Repeat unfolded x4 + z0 = svread_hor_za32_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 1 ); + z1 = svread_hor_za32_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 1 ); + z2 = svread_hor_za32_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 1 ); + z3 = svread_hor_za32_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 1 ); + + z0 = svmul_f32_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f32_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f32_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f32_z( svptrue_b32(), z3, zalpha ); + + svfloat32x4_t z5w = svcreate4( z0, z1, z2, z3 ); + svst1_f32_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 1 ) * rs_c], z5w ); + + z0 = svread_hor_za32_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 2 ); + z1 = svread_hor_za32_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 2 ); + z2 = svread_hor_za32_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 2 ); + z3 = svread_hor_za32_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 2 ); + + z0 = svmul_f32_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f32_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f32_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f32_z( svptrue_b32(), z3, zalpha ); + + svfloat32x4_t z6w = svcreate4( z0, z1, z2, z3 ); + svst1_f32_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 2 ) * rs_c], z6w ); + + z0 = svread_hor_za32_m( z0, svptrue_b32(), + /* 
tile: */ 0, /* slice: */ tcol + 3 ); + z1 = svread_hor_za32_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 3 ); + z2 = svread_hor_za32_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 3 ); + z3 = svread_hor_za32_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 3 ); + + z0 = svmul_f32_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f32_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f32_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f32_z( svptrue_b32(), z3, zalpha ); + + svfloat32x4_t z7w = svcreate4( z0, z1, z2, z3 ); + svst1_f32_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 3 ) * rs_c], z7w ); + } + } + else + { + for ( uint64_t tcol = 0; tcol < SVL; tcol += 4 ) + { + // Read ZA slices into Z regs + svfloat32_t z0 = svread_hor_za32_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 0 ); + svfloat32_t z1 = svread_hor_za32_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 0 ); + svfloat32_t z2 = svread_hor_za32_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 0 ); + svfloat32_t z3 = svread_hor_za32_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 0 ); + + // Scale Z regs by broadcast alpha + z0 = svmul_f32_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f32_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f32_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f32_z( svptrue_b32(), z3, zalpha ); + + // Load C into Z regs + svfloat32x4_t z4q = svld1_f32_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 0 ) * rs_c] ); + + // Scale Z regs by broadcast beta + svfloat32_t z4 = svmla_m( svptrue_b32(), z0, svget4( z4q, 0 ), + zbeta ); + svfloat32_t z5 = svmla_m( svptrue_b32(), z1, svget4( z4q, 1 ), + zbeta ); + svfloat32_t z6 = svmla_m( svptrue_b32(), z2, svget4( z4q, 2 ), + zbeta ); + svfloat32_t z7 = svmla_m( svptrue_b32(), z3, svget4( z4q, 3 ), + zbeta ); + + // Store full result into C + svfloat32x4_t z4w = svcreate4( z4, z5, z6, z7 ); + svst1_f32_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol 
+ 0 ) * rs_c], z4w ); + + // Repeat unfolded x4 + z0 = svread_hor_za32_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 1 ); + z1 = svread_hor_za32_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 1 ); + z2 = svread_hor_za32_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 1 ); + z3 = svread_hor_za32_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 1 ); + + z0 = svmul_f32_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f32_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f32_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f32_z( svptrue_b32(), z3, zalpha ); + + svfloat32x4_t z5q = svld1_f32_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 1 ) * rs_c] ); + + z4 = svmla_m( svptrue_b32(), z0, svget4( z5q, 0 ), zbeta ); + z5 = svmla_m( svptrue_b32(), z1, svget4( z5q, 1 ), zbeta ); + z6 = svmla_m( svptrue_b32(), z2, svget4( z5q, 2 ), zbeta ); + z7 = svmla_m( svptrue_b32(), z3, svget4( z5q, 3 ), zbeta ); + + svfloat32x4_t z5w = svcreate4( z4, z5, z6, z7 ); + svst1_f32_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 1 ) * rs_c], z5w ); + + z0 = svread_hor_za32_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 2 ); + z1 = svread_hor_za32_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 2 ); + z2 = svread_hor_za32_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 2 ); + z3 = svread_hor_za32_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 2 ); + + z0 = svmul_f32_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f32_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f32_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f32_z( svptrue_b32(), z3, zalpha ); + + svfloat32x4_t z6q = svld1_f32_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 2 ) * rs_c] ); + + z4 = svmla_m( svptrue_b32(), z0, svget4( z6q, 0 ), zbeta ); + z5 = svmla_m( svptrue_b32(), z1, svget4( z6q, 1 ), zbeta ); + z6 = svmla_m( svptrue_b32(), z2, svget4( z6q, 2 ), zbeta ); + z7 = svmla_m( svptrue_b32(), z3, svget4( z6q, 3 ), zbeta ); 
+ + svfloat32x4_t z6w = svcreate4( z4, z5, z6, z7 ); + svst1_f32_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 2 ) * rs_c], z6w ); + + z0 = svread_hor_za32_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 3 ); + z1 = svread_hor_za32_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 3 ); + z2 = svread_hor_za32_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 3 ); + z3 = svread_hor_za32_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 3 ); + + z0 = svmul_f32_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f32_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f32_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f32_z( svptrue_b32(), z3, zalpha ); + + svfloat32x4_t z7q = svld1_f32_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 3 ) * rs_c] ); + + z4 = svmla_m( svptrue_b32(), z0, svget4( z7q, 0 ), zbeta ); + z5 = svmla_m( svptrue_b32(), z1, svget4( z7q, 1 ), zbeta ); + z6 = svmla_m( svptrue_b32(), z2, svget4( z7q, 2 ), zbeta ); + z7 = svmla_m( svptrue_b32(), z3, svget4( z7q, 3 ), zbeta ); + + svfloat32x4_t z7w = svcreate4( z4, z5, z6, z7 ); + svst1_f32_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 3 ) * rs_c], z7w ); + } + } + } + + GEMM_UKR_FLUSH_CT( s ); + + return; +} + +__arm_new( "za" ) __arm_locally_streaming void bli_dgemm_armsme_int_SVLx8SVL + ( + dim_t m, + dim_t n, + dim_t k, + const void* alpha, + const void* a, + const void* b, + const void* beta, + void* c, inc_t rs_c, inc_t cs_c, + const auxinfo_t* data, + const cntx_t* cntx + ) +{ + uint64_t SVL = svcntsd(); + + GEMM_UKR_SETUP_CT_AMBI( d, SVL, 8 * SVL, false ); + + double* a_ = (double*)a; + double* b_ = (double*)b; + double* c_ = (double*)c; + + svzero_za(); + + uint64_t k_; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; + + for ( k_ = 0; k_ < k_iter; k_++ ) + { + // Loads + svfloat64x4_t zL00 = svld1_f64_x4( svptrue_c32(), + (float64_t*)( &b_[0] ) ); + svfloat64x4_t zL01 = svld1_f64_x4( svptrue_c32(), + (float64_t*)( &b_[4 * SVL] ) ); + 
svfloat64_t zR00 = svld1_f64( svptrue_b32(), (float64_t*)( &a_[0] ) ); + + svmopa_za64_m( 0, svptrue_b32(), svptrue_b32(), svget4( zL00, 0 ), + zR00 ); + svmopa_za64_m( 1, svptrue_b32(), svptrue_b32(), svget4( zL00, 1 ), + zR00 ); + + svmopa_za64_m( 2, svptrue_b32(), svptrue_b32(), svget4( zL00, 2 ), + zR00 ); + svmopa_za64_m( 3, svptrue_b32(), svptrue_b32(), svget4( zL00, 3 ), + zR00 ); + + svmopa_za64_m( 4, svptrue_b32(), svptrue_b32(), svget4( zL01, 0 ), + zR00 ); + svmopa_za64_m( 5, svptrue_b32(), svptrue_b32(), svget4( zL01, 1 ), + zR00 ); + + svmopa_za64_m( 6, svptrue_b32(), svptrue_b32(), svget4( zL01, 2 ), + zR00 ); + svmopa_za64_m( 7, svptrue_b32(), svptrue_b32(), svget4( zL01, 3 ), + zR00 ); + + svfloat64x4_t zL02 = svld1_f64_x4( svptrue_c32(), + (float64_t*)( &b_[8 * SVL] ) ); + svfloat64x4_t zL03 = svld1_f64_x4( svptrue_c32(), + (float64_t*)( &b_[12 * SVL] ) ); + svfloat64_t zR01 = svld1_f64( svptrue_b32(), + (float64_t*)( &a_[1 * SVL] ) ); + + svmopa_za64_m( 0, svptrue_b32(), svptrue_b32(), svget4( zL02, 0 ), + zR01 ); + svmopa_za64_m( 1, svptrue_b32(), svptrue_b32(), svget4( zL02, 1 ), + zR01 ); + + svmopa_za64_m( 2, svptrue_b32(), svptrue_b32(), svget4( zL02, 2 ), + zR01 ); + svmopa_za64_m( 3, svptrue_b32(), svptrue_b32(), svget4( zL02, 3 ), + zR01 ); + + svmopa_za64_m( 4, svptrue_b32(), svptrue_b32(), svget4( zL03, 0 ), + zR01 ); + svmopa_za64_m( 5, svptrue_b32(), svptrue_b32(), svget4( zL03, 1 ), + zR01 ); + + svmopa_za64_m( 6, svptrue_b32(), svptrue_b32(), svget4( zL03, 2 ), + zR01 ); + svmopa_za64_m( 7, svptrue_b32(), svptrue_b32(), svget4( zL03, 3 ), + zR01 ); + + svfloat64x4_t zL04 = svld1_f64_x4( svptrue_c32(), + (float64_t*)( &b_[16 * SVL] ) ); + svfloat64x4_t zL05 = svld1_f64_x4( svptrue_c32(), + (float64_t*)( &b_[20 * SVL] ) ); + svfloat64_t zR02 = svld1_f64( svptrue_b32(), + (float64_t*)( &a_[2 * SVL] ) ); + + svmopa_za64_m( 0, svptrue_b32(), svptrue_b32(), svget4( zL04, 0 ), + zR02 ); + svmopa_za64_m( 1, svptrue_b32(), svptrue_b32(), 
svget4( zL04, 1 ), + zR02 ); + + svmopa_za64_m( 2, svptrue_b32(), svptrue_b32(), svget4( zL04, 2 ), + zR02 ); + svmopa_za64_m( 3, svptrue_b32(), svptrue_b32(), svget4( zL04, 3 ), + zR02 ); + + svmopa_za64_m( 4, svptrue_b32(), svptrue_b32(), svget4( zL05, 0 ), + zR02 ); + svmopa_za64_m( 5, svptrue_b32(), svptrue_b32(), svget4( zL05, 1 ), + zR02 ); + + svmopa_za64_m( 6, svptrue_b32(), svptrue_b32(), svget4( zL05, 2 ), + zR02 ); + svmopa_za64_m( 7, svptrue_b32(), svptrue_b32(), svget4( zL05, 3 ), + zR02 ); + + svfloat64x4_t zL06 = svld1_f64_x4( svptrue_c32(), + (float64_t*)( &b_[24 * SVL] ) ); + svfloat64x4_t zL07 = svld1_f64_x4( svptrue_c32(), + (float64_t*)( &b_[28 * SVL] ) ); + svfloat64_t zR03 = svld1_f64( svptrue_b32(), + (float64_t*)( &a_[3 * SVL] ) ); + + svmopa_za64_m( 0, svptrue_b32(), svptrue_b32(), svget4( zL06, 0 ), + zR03 ); + svmopa_za64_m( 1, svptrue_b32(), svptrue_b32(), svget4( zL06, 1 ), + zR03 ); + + svmopa_za64_m( 2, svptrue_b32(), svptrue_b32(), svget4( zL06, 2 ), + zR03 ); + svmopa_za64_m( 3, svptrue_b32(), svptrue_b32(), svget4( zL06, 3 ), + zR03 ); + + svmopa_za64_m( 4, svptrue_b32(), svptrue_b32(), svget4( zL07, 0 ), + zR03 ); + svmopa_za64_m( 5, svptrue_b32(), svptrue_b32(), svget4( zL07, 1 ), + zR03 ); + + svmopa_za64_m( 6, svptrue_b32(), svptrue_b32(), svget4( zL07, 2 ), + zR03 ); + svmopa_za64_m( 7, svptrue_b32(), svptrue_b32(), svget4( zL07, 3 ), + zR03 ); + + a_ += ( 4 * SVL ); + b_ += ( 4 * 8 * SVL ); + } + + for ( k_ = 0; k_ < k_left; k_ += 1 ) + { + svfloat64x4_t zL00 = svld1_f64_x4( svptrue_c32(), + (float64_t*)( &b_[0] ) ); + svfloat64x4_t zL01 = svld1_f64_x4( svptrue_c32(), + (float64_t*)( &b_[4 * SVL] ) ); + svfloat64_t zR00 = svld1_f64( svptrue_b32(), (float64_t*)( &a_[0] ) ); + + svmopa_za64_m( 0, svptrue_b32(), svptrue_b32(), svget4( zL00, 0 ), + zR00 ); + svmopa_za64_m( 1, svptrue_b32(), svptrue_b32(), svget4( zL00, 1 ), + zR00 ); + + svmopa_za64_m( 2, svptrue_b32(), svptrue_b32(), svget4( zL00, 2 ), + zR00 ); + svmopa_za64_m( 
3, svptrue_b32(), svptrue_b32(), svget4( zL00, 3 ), + zR00 ); + + svmopa_za64_m( 4, svptrue_b32(), svptrue_b32(), svget4( zL01, 0 ), + zR00 ); + svmopa_za64_m( 5, svptrue_b32(), svptrue_b32(), svget4( zL01, 1 ), + zR00 ); + + svmopa_za64_m( 6, svptrue_b32(), svptrue_b32(), svget4( zL01, 2 ), + zR00 ); + svmopa_za64_m( 7, svptrue_b32(), svptrue_b32(), svget4( zL01, 3 ), + zR00 ); + + a_ += ( SVL ); + b_ += ( 8 * SVL ); + } + + double beta_ = *(double*)beta; + double alpha_ = *(double*)alpha; + + const uint64_t result_tile_TL_corner = 0; + + svfloat64_t zbeta = svdup_f64( beta_ ); + svfloat64_t zalpha = svdup_f64( alpha_ ); + + if ( cs_c == 1 ) + { + const uint64_t result_tile_TR_corner = 4 * SVL; + + if ( beta_ == 0 ) + { + for ( uint64_t tcol = 0; tcol < SVL; tcol += 4 ) + { + // Read ZA slices into Z regs + svfloat64_t z0 = svread_ver_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 0 ); + svfloat64_t z1 = svread_ver_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 0 ); + svfloat64_t z2 = svread_ver_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 0 ); + svfloat64_t z3 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 0 ); + svfloat64_t z4 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 0 ); + svfloat64_t z5 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 0 ); + svfloat64_t z6 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 0 ); + svfloat64_t z7 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 0 ); + + // Scale Z regs by broadcast alpha + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = 
svmul_f64_z( svptrue_b32(), z7, zalpha ); + + // Store full result into C + svfloat64x4_t z400 = svcreate4( z0, z1, z2, z3 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 0 ) * rs_c], z400 ); + + svfloat64x4_t z600 = svcreate4( z4, z5, z6, z7 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 0 ) * rs_c], z600 ); + + // tcol + 1 + z0 = svread_ver_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 1 ); + z1 = svread_ver_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 1 ); + z2 = svread_ver_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 1 ); + z3 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 1 ); + z4 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 1 ); + z5 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 1 ); + z6 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 1 ); + z7 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 1 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + z400 = svcreate4( z0, z1, z2, z3 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 1 ) * rs_c], z400 ); + + z600 = svcreate4( z4, z5, z6, z7 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 1 ) * rs_c], z600 ); + + // tcol + 2 + z0 = svread_ver_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 2 ); + z1 = svread_ver_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 2 ); + z2 = svread_ver_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol 
+ 2 ); + z3 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 2 ); + z4 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 2 ); + z5 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 2 ); + z6 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 2 ); + z7 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 2 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + z400 = svcreate4( z0, z1, z2, z3 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 2 ) * rs_c], z400 ); + + z600 = svcreate4( z4, z5, z6, z7 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 2 ) * rs_c], z600 ); + + // tcol + 3 + z0 = svread_ver_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 3 ); + z1 = svread_ver_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 3 ); + z2 = svread_ver_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 3 ); + z3 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 3 ); + z4 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 3 ); + z5 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 3 ); + z6 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 3 ); + z7 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 3 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, 
zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + z400 = svcreate4( z0, z1, z2, z3 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 3 ) * rs_c], z400 ); + + z600 = svcreate4( z4, z5, z6, z7 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 3 ) * rs_c], z600 ); + } + } + else + { + for ( uint64_t tcol = 0; tcol < SVL; tcol += 4 ) + { + // Read ZA slices into Z regs + svfloat64_t z0 = svread_ver_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 0 ); + svfloat64_t z1 = svread_ver_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 0 ); + svfloat64_t z2 = svread_ver_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 0 ); + svfloat64_t z3 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 0 ); + svfloat64_t z4 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 0 ); + svfloat64_t z5 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 0 ); + svfloat64_t z6 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 0 ); + svfloat64_t z7 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 0 ); + + // Scale Z regs by broadcast alpha + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + // Load C into Z regs + + svfloat64x4_t zq5 = svld1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 0 ) * rs_c ) )] ); + svfloat64x4_t zq6 = svld1_f64_x4( svptrue_c32(), + 
&c_[result_tile_TR_corner + ( ( ( tcol + 0 ) * rs_c ) )] ); + + // Scale Z regs by broadcast beta + svfloat64_t z40 = svmla_m( svptrue_b32(), z0, svget4( zq5, 0 ), + zbeta ); + svfloat64_t z50 = svmla_m( svptrue_b32(), z1, svget4( zq5, 1 ), + zbeta ); + svfloat64_t z60 = svmla_m( svptrue_b32(), z2, svget4( zq5, 2 ), + zbeta ); + svfloat64_t z70 = svmla_m( svptrue_b32(), z3, svget4( zq5, 3 ), + zbeta ); + svfloat64_t z80 = svmla_m( svptrue_b32(), z4, svget4( zq6, 0 ), + zbeta ); + svfloat64_t z90 = svmla_m( svptrue_b32(), z5, svget4( zq6, 1 ), + zbeta ); + svfloat64_t za0 = svmla_m( svptrue_b32(), z6, svget4( zq6, 2 ), + zbeta ); + svfloat64_t zb0 = svmla_m( svptrue_b32(), z7, svget4( zq6, 3 ), + zbeta ); + + // Store full result into C + svfloat64x4_t z400 = svcreate4( z40, z50, z60, z70 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 0 ) * rs_c], z400 ); + + svfloat64x4_t z600 = svcreate4( z80, z90, za0, zb0 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 0 ) * rs_c], z600 ); + + // tcol + 1 + z0 = svread_ver_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 1 ); + z1 = svread_ver_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 1 ); + z2 = svread_ver_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 1 ); + z3 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 1 ); + z4 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 1 ); + z5 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 1 ); + z6 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 1 ); + z7 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 1 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha 
); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + zq5 = svld1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 1 ) * rs_c ) )] ); + zq6 = svld1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( ( ( tcol + 1 ) * rs_c ) )] ); + + z40 = svmla_m( svptrue_b32(), z0, svget4( zq5, 0 ), zbeta ); + z50 = svmla_m( svptrue_b32(), z1, svget4( zq5, 1 ), zbeta ); + z60 = svmla_m( svptrue_b32(), z2, svget4( zq5, 2 ), zbeta ); + z70 = svmla_m( svptrue_b32(), z3, svget4( zq5, 3 ), zbeta ); + z80 = svmla_m( svptrue_b32(), z4, svget4( zq6, 0 ), zbeta ); + z90 = svmla_m( svptrue_b32(), z5, svget4( zq6, 1 ), zbeta ); + za0 = svmla_m( svptrue_b32(), z6, svget4( zq6, 2 ), zbeta ); + zb0 = svmla_m( svptrue_b32(), z7, svget4( zq6, 3 ), zbeta ); + + z400 = svcreate4( z40, z50, z60, z70 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 1 ) * rs_c], z400 ); + + z600 = svcreate4( z80, z90, za0, zb0 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 1 ) * rs_c], z600 ); + + // tcol + 2 + z0 = svread_ver_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 2 ); + z1 = svread_ver_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 2 ); + z2 = svread_ver_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 2 ); + z3 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 2 ); + z4 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 2 ); + z5 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 2 ); + z6 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 2 ); + z7 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 2 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = 
svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + zq5 = svld1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 2 ) * rs_c ) )] ); + zq6 = svld1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( ( ( tcol + 2 ) * rs_c ) )] ); + + z40 = svmla_m( svptrue_b32(), z0, svget4( zq5, 0 ), zbeta ); + z50 = svmla_m( svptrue_b32(), z1, svget4( zq5, 1 ), zbeta ); + z60 = svmla_m( svptrue_b32(), z2, svget4( zq5, 2 ), zbeta ); + z70 = svmla_m( svptrue_b32(), z3, svget4( zq5, 3 ), zbeta ); + z80 = svmla_m( svptrue_b32(), z4, svget4( zq6, 0 ), zbeta ); + z90 = svmla_m( svptrue_b32(), z5, svget4( zq6, 1 ), zbeta ); + za0 = svmla_m( svptrue_b32(), z6, svget4( zq6, 2 ), zbeta ); + zb0 = svmla_m( svptrue_b32(), z7, svget4( zq6, 3 ), zbeta ); + + z400 = svcreate4( z40, z50, z60, z70 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 2 ) * rs_c], z400 ); + + z600 = svcreate4( z80, z90, za0, zb0 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 2 ) * rs_c], z600 ); + + // tcol + 3 + z0 = svread_ver_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 3 ); + z1 = svread_ver_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 3 ); + z2 = svread_ver_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 3 ); + z3 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 3 ); + z4 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 3 ); + z5 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 3 ); + z6 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 3 ); + z7 = svread_ver_za64_m( z3, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 3 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( 
svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + zq5 = svld1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 3 ) * rs_c ) )] ); + zq6 = svld1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( ( ( tcol + 3 ) * rs_c ) )] ); + + z40 = svmla_m( svptrue_b32(), z0, svget4( zq5, 0 ), zbeta ); + z50 = svmla_m( svptrue_b32(), z1, svget4( zq5, 1 ), zbeta ); + z60 = svmla_m( svptrue_b32(), z2, svget4( zq5, 2 ), zbeta ); + z70 = svmla_m( svptrue_b32(), z3, svget4( zq5, 3 ), zbeta ); + z80 = svmla_m( svptrue_b32(), z4, svget4( zq6, 0 ), zbeta ); + z90 = svmla_m( svptrue_b32(), z5, svget4( zq6, 1 ), zbeta ); + za0 = svmla_m( svptrue_b32(), z6, svget4( zq6, 2 ), zbeta ); + zb0 = svmla_m( svptrue_b32(), z7, svget4( zq6, 3 ), zbeta ); + + z400 = svcreate4( z40, z50, z60, z70 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TL_corner + ( tcol + 3 ) * rs_c], z400 ); + + z600 = svcreate4( z80, z90, za0, zb0 ); + svst1_f64_x4( svptrue_c32(), + &c_[result_tile_TR_corner + ( tcol + 3 ) * rs_c], z600 ); + } + } + } + else + { + const uint64_t result_tile_1 = SVL * cs_c; + const uint64_t result_tile_2 = SVL * 2 * cs_c; + const uint64_t result_tile_3 = SVL * 3 * cs_c; + const uint64_t result_tile_4 = SVL * 4 * cs_c; + const uint64_t result_tile_5 = SVL * 5 * cs_c; + const uint64_t result_tile_6 = SVL * 6 * cs_c; + const uint64_t result_tile_7 = SVL * 7 * cs_c; + + if ( beta_ == 0 ) + { + for ( uint64_t tcol = 0; tcol < SVL; tcol += 4 ) + { + // Read ZA slices into Z regs + svfloat64_t z0 = svread_hor_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 0 ); + svfloat64_t z1 = svread_hor_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 0 ); + svfloat64_t z2 = 
svread_hor_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 0 ); + svfloat64_t z3 = svread_hor_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 0 ); + svfloat64_t z4 = svread_hor_za64_m( z4, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 0 ); + svfloat64_t z5 = svread_hor_za64_m( z5, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 0 ); + svfloat64_t z6 = svread_hor_za64_m( z6, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 0 ); + svfloat64_t z7 = svread_hor_za64_m( z7, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 0 ); + + // Scale Z regs by broadcast alpha + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + // Store full result into C + svst1_f64( svptrue_b32(), + &c_[result_tile_TL_corner + ( tcol + 0 ) * cs_c], z0 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_1 + ( tcol + 0 ) * cs_c], z1 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_2 + ( tcol + 0 ) * cs_c], z2 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_3 + ( tcol + 0 ) * cs_c], z3 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_4 + ( tcol + 0 ) * cs_c], z4 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_5 + ( tcol + 0 ) * cs_c], z5 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_6 + ( tcol + 0 ) * cs_c], z6 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_7 + ( tcol + 0 ) * cs_c], z7 ); + + // tcol + 1 + z0 = svread_hor_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 1 ); + z1 = svread_hor_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 1 ); + z2 = svread_hor_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 1 ); + z3 = svread_hor_za64_m( z3, svptrue_b32(), + /* 
tile: */ 3, /* slice: */ tcol + 1 ); + z4 = svread_hor_za64_m( z4, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 1 ); + z5 = svread_hor_za64_m( z5, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 1 ); + z6 = svread_hor_za64_m( z6, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 1 ); + z7 = svread_hor_za64_m( z7, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 1 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + svst1_f64( svptrue_b32(), + &c_[result_tile_TL_corner + ( tcol + 1 ) * cs_c], z0 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_1 + ( tcol + 1 ) * cs_c], z1 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_2 + ( tcol + 1 ) * cs_c], z2 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_3 + ( tcol + 1 ) * cs_c], z3 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_4 + ( tcol + 1 ) * cs_c], z4 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_5 + ( tcol + 1 ) * cs_c], z5 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_6 + ( tcol + 1 ) * cs_c], z6 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_7 + ( tcol + 1 ) * cs_c], z7 ); + + // tcol + 2 + z0 = svread_hor_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 2 ); + z1 = svread_hor_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 2 ); + z2 = svread_hor_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 2 ); + z3 = svread_hor_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 2 ); + z4 = svread_hor_za64_m( z4, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 2 ); + z5 = svread_hor_za64_m( z5, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 2 ); + z6 = svread_hor_za64_m( z6, svptrue_b32(), + 
/* tile: */ 6, /* slice: */ tcol + 2 ); + z7 = svread_hor_za64_m( z7, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 2 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + svst1_f64( svptrue_b32(), + &c_[result_tile_TL_corner + ( tcol + 2 ) * cs_c], z0 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_1 + ( tcol + 2 ) * cs_c], z1 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_2 + ( tcol + 2 ) * cs_c], z2 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_3 + ( tcol + 2 ) * cs_c], z3 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_4 + ( tcol + 2 ) * cs_c], z4 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_5 + ( tcol + 2 ) * cs_c], z5 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_6 + ( tcol + 2 ) * cs_c], z6 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_7 + ( tcol + 2 ) * cs_c], z7 ); + + // tcol + 3 + z0 = svread_hor_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 3 ); + z1 = svread_hor_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 3 ); + z2 = svread_hor_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 3 ); + z3 = svread_hor_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 3 ); + z4 = svread_hor_za64_m( z4, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 3 ); + z5 = svread_hor_za64_m( z5, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 3 ); + z6 = svread_hor_za64_m( z6, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 3 ); + z7 = svread_hor_za64_m( z7, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 3 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( 
svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + svst1_f64( svptrue_b32(), + &c_[result_tile_TL_corner + ( tcol + 3 ) * cs_c], z0 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_1 + ( tcol + 3 ) * cs_c], z1 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_2 + ( tcol + 3 ) * cs_c], z2 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_3 + ( tcol + 3 ) * cs_c], z3 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_4 + ( tcol + 3 ) * cs_c], z4 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_5 + ( tcol + 3 ) * cs_c], z5 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_6 + ( tcol + 3 ) * cs_c], z6 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_7 + ( tcol + 3 ) * cs_c], z7 ); + } + } + else + { + for ( uint64_t tcol = 0; tcol < SVL; tcol += 4 ) + { + // Read ZA slices into Z regs + svfloat64_t z0 = svread_hor_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 0 ); + svfloat64_t z1 = svread_hor_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 0 ); + svfloat64_t z2 = svread_hor_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 0 ); + svfloat64_t z3 = svread_hor_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 0 ); + svfloat64_t z4 = svread_hor_za64_m( z4, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 0 ); + svfloat64_t z5 = svread_hor_za64_m( z5, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 0 ); + svfloat64_t z6 = svread_hor_za64_m( z6, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 0 ); + svfloat64_t z7 = svread_hor_za64_m( z7, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 0 ); + + // Scale Z regs by broadcast alpha + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = 
svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + // Load C into Z regs + svfloat64_t zq0 = svld1_f64( svptrue_b32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 0 ) * cs_c ) )] ); + svfloat64_t zq1 = svld1_f64( svptrue_b32(), + &c_[result_tile_1 + ( ( ( tcol + 0 ) * cs_c ) )] ); + svfloat64_t zq2 = svld1_f64( svptrue_b32(), + &c_[result_tile_2 + ( ( ( tcol + 0 ) * cs_c ) )] ); + svfloat64_t zq3 = svld1_f64( svptrue_b32(), + &c_[result_tile_3 + ( ( ( tcol + 0 ) * cs_c ) )] ); + svfloat64_t zq4 = svld1_f64( svptrue_b32(), + &c_[result_tile_4 + ( ( ( tcol + 0 ) * cs_c ) )] ); + svfloat64_t zq5 = svld1_f64( svptrue_b32(), + &c_[result_tile_5 + ( ( ( tcol + 0 ) * cs_c ) )] ); + svfloat64_t zq6 = svld1_f64( svptrue_b32(), + &c_[result_tile_6 + ( ( ( tcol + 0 ) * cs_c ) )] ); + svfloat64_t zq7 = svld1_f64( svptrue_b32(), + &c_[result_tile_7 + ( ( ( tcol + 0 ) * cs_c ) )] ); + + // Scale Z regs by broadcast beta + svfloat64_t z00 = svmla_m( svptrue_b32(), z0, zq0, zbeta ); + svfloat64_t z10 = svmla_m( svptrue_b32(), z1, zq1, zbeta ); + svfloat64_t z20 = svmla_m( svptrue_b32(), z2, zq2, zbeta ); + svfloat64_t z30 = svmla_m( svptrue_b32(), z3, zq3, zbeta ); + svfloat64_t z40 = svmla_m( svptrue_b32(), z4, zq4, zbeta ); + svfloat64_t z50 = svmla_m( svptrue_b32(), z5, zq5, zbeta ); + svfloat64_t z60 = svmla_m( svptrue_b32(), z6, zq6, zbeta ); + svfloat64_t z70 = svmla_m( svptrue_b32(), z7, zq7, zbeta ); + + // Store full result into C + svst1_f64( svptrue_b32(), + &c_[result_tile_TL_corner + ( tcol + 0 ) * cs_c], z00 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_1 + ( tcol + 0 ) * cs_c], z10 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_2 + ( tcol + 0 ) * cs_c], z20 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_3 + ( tcol + 0 ) * cs_c], z30 ); + svst1_f64( svptrue_b32(), + 
&c_[result_tile_4 + ( tcol + 0 ) * cs_c], z40 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_5 + ( tcol + 0 ) * cs_c], z50 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_6 + ( tcol + 0 ) * cs_c], z60 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_7 + ( tcol + 0 ) * cs_c], z70 ); + + // tcol + 1 + z0 = svread_hor_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 1 ); + z1 = svread_hor_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 1 ); + z2 = svread_hor_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 1 ); + z3 = svread_hor_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 1 ); + z4 = svread_hor_za64_m( z4, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 1 ); + z5 = svread_hor_za64_m( z5, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 1 ); + z6 = svread_hor_za64_m( z6, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 1 ); + z7 = svread_hor_za64_m( z7, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 1 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + zq0 = svld1_f64( svptrue_b32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 1 ) * cs_c ) )] ); + zq1 = svld1_f64( svptrue_b32(), + &c_[result_tile_1 + ( ( ( tcol + 1 ) * cs_c ) )] ); + zq2 = svld1_f64( svptrue_b32(), + &c_[result_tile_2 + ( ( ( tcol + 1 ) * cs_c ) )] ); + zq3 = svld1_f64( svptrue_b32(), + &c_[result_tile_3 + ( ( ( tcol + 1 ) * cs_c ) )] ); + zq4 = svld1_f64( svptrue_b32(), + &c_[result_tile_4 + ( ( ( tcol + 1 ) * cs_c ) )] ); + zq5 = svld1_f64( svptrue_b32(), + &c_[result_tile_5 + ( ( ( tcol + 1 ) * cs_c ) )] ); + zq6 = svld1_f64( svptrue_b32(), + &c_[result_tile_6 + ( ( ( tcol + 
1 ) * cs_c ) )] ); + zq7 = svld1_f64( svptrue_b32(), + &c_[result_tile_7 + ( ( ( tcol + 1 ) * cs_c ) )] ); + + z00 = svmla_m( svptrue_b32(), z0, zq0, zbeta ); + z10 = svmla_m( svptrue_b32(), z1, zq1, zbeta ); + z20 = svmla_m( svptrue_b32(), z2, zq2, zbeta ); + z30 = svmla_m( svptrue_b32(), z3, zq3, zbeta ); + z40 = svmla_m( svptrue_b32(), z4, zq4, zbeta ); + z50 = svmla_m( svptrue_b32(), z5, zq5, zbeta ); + z60 = svmla_m( svptrue_b32(), z6, zq6, zbeta ); + z70 = svmla_m( svptrue_b32(), z7, zq7, zbeta ); + + svst1_f64( svptrue_b32(), + &c_[result_tile_TL_corner + ( tcol + 1 ) * cs_c], z00 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_1 + ( tcol + 1 ) * cs_c], z10 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_2 + ( tcol + 1 ) * cs_c], z20 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_3 + ( tcol + 1 ) * cs_c], z30 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_4 + ( tcol + 1 ) * cs_c], z40 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_5 + ( tcol + 1 ) * cs_c], z50 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_6 + ( tcol + 1 ) * cs_c], z60 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_7 + ( tcol + 1 ) * cs_c], z70 ); + + // tcol + 2 + z0 = svread_hor_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 2 ); + z1 = svread_hor_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 2 ); + z2 = svread_hor_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 2 ); + z3 = svread_hor_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 2 ); + z4 = svread_hor_za64_m( z4, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 2 ); + z5 = svread_hor_za64_m( z5, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 2 ); + z6 = svread_hor_za64_m( z6, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 2 ); + z7 = svread_hor_za64_m( z7, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 2 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), 
z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + zq0 = svld1_f64( svptrue_b32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 2 ) * cs_c ) )] ); + zq1 = svld1_f64( svptrue_b32(), + &c_[result_tile_1 + ( ( ( tcol + 2 ) * cs_c ) )] ); + zq2 = svld1_f64( svptrue_b32(), + &c_[result_tile_2 + ( ( ( tcol + 2 ) * cs_c ) )] ); + zq3 = svld1_f64( svptrue_b32(), + &c_[result_tile_3 + ( ( ( tcol + 2 ) * cs_c ) )] ); + zq4 = svld1_f64( svptrue_b32(), + &c_[result_tile_4 + ( ( ( tcol + 2 ) * cs_c ) )] ); + zq5 = svld1_f64( svptrue_b32(), + &c_[result_tile_5 + ( ( ( tcol + 2 ) * cs_c ) )] ); + zq6 = svld1_f64( svptrue_b32(), + &c_[result_tile_6 + ( ( ( tcol + 2 ) * cs_c ) )] ); + zq7 = svld1_f64( svptrue_b32(), + &c_[result_tile_7 + ( ( ( tcol + 2 ) * cs_c ) )] ); + + z00 = svmla_m( svptrue_b32(), z0, zq0, zbeta ); + z10 = svmla_m( svptrue_b32(), z1, zq1, zbeta ); + z20 = svmla_m( svptrue_b32(), z2, zq2, zbeta ); + z30 = svmla_m( svptrue_b32(), z3, zq3, zbeta ); + z40 = svmla_m( svptrue_b32(), z4, zq4, zbeta ); + z50 = svmla_m( svptrue_b32(), z5, zq5, zbeta ); + z60 = svmla_m( svptrue_b32(), z6, zq6, zbeta ); + z70 = svmla_m( svptrue_b32(), z7, zq7, zbeta ); + + svst1_f64( svptrue_b32(), + &c_[result_tile_TL_corner + ( tcol + 2 ) * cs_c], z00 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_1 + ( tcol + 2 ) * cs_c], z10 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_2 + ( tcol + 2 ) * cs_c], z20 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_3 + ( tcol + 2 ) * cs_c], z30 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_4 + ( tcol + 2 ) * cs_c], z40 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_5 + ( tcol + 2 ) * cs_c], z50 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_6 + ( tcol + 2 ) * cs_c], z60 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_7 + ( tcol + 2 
) * cs_c], z70 ); + + // tcol + 3 + z0 = svread_hor_za64_m( z0, svptrue_b32(), + /* tile: */ 0, /* slice: */ tcol + 3 ); + z1 = svread_hor_za64_m( z1, svptrue_b32(), + /* tile: */ 1, /* slice: */ tcol + 3 ); + z2 = svread_hor_za64_m( z2, svptrue_b32(), + /* tile: */ 2, /* slice: */ tcol + 3 ); + z3 = svread_hor_za64_m( z3, svptrue_b32(), + /* tile: */ 3, /* slice: */ tcol + 3 ); + z4 = svread_hor_za64_m( z4, svptrue_b32(), + /* tile: */ 4, /* slice: */ tcol + 3 ); + z5 = svread_hor_za64_m( z5, svptrue_b32(), + /* tile: */ 5, /* slice: */ tcol + 3 ); + z6 = svread_hor_za64_m( z6, svptrue_b32(), + /* tile: */ 6, /* slice: */ tcol + 3 ); + z7 = svread_hor_za64_m( z7, svptrue_b32(), + /* tile: */ 7, /* slice: */ tcol + 3 ); + + z0 = svmul_f64_z( svptrue_b32(), z0, zalpha ); + z1 = svmul_f64_z( svptrue_b32(), z1, zalpha ); + z2 = svmul_f64_z( svptrue_b32(), z2, zalpha ); + z3 = svmul_f64_z( svptrue_b32(), z3, zalpha ); + z4 = svmul_f64_z( svptrue_b32(), z4, zalpha ); + z5 = svmul_f64_z( svptrue_b32(), z5, zalpha ); + z6 = svmul_f64_z( svptrue_b32(), z6, zalpha ); + z7 = svmul_f64_z( svptrue_b32(), z7, zalpha ); + + zq0 = svld1_f64( svptrue_b32(), + &c_[result_tile_TL_corner + ( ( ( tcol + 3 ) * cs_c ) )] ); + zq1 = svld1_f64( svptrue_b32(), + &c_[result_tile_1 + ( ( ( tcol + 3 ) * cs_c ) )] ); + zq2 = svld1_f64( svptrue_b32(), + &c_[result_tile_2 + ( ( ( tcol + 3 ) * cs_c ) )] ); + zq3 = svld1_f64( svptrue_b32(), + &c_[result_tile_3 + ( ( ( tcol + 3 ) * cs_c ) )] ); + zq4 = svld1_f64( svptrue_b32(), + &c_[result_tile_4 + ( ( ( tcol + 3 ) * cs_c ) )] ); + zq5 = svld1_f64( svptrue_b32(), + &c_[result_tile_5 + ( ( ( tcol + 3 ) * cs_c ) )] ); + zq6 = svld1_f64( svptrue_b32(), + &c_[result_tile_6 + ( ( ( tcol + 3 ) * cs_c ) )] ); + zq7 = svld1_f64( svptrue_b32(), + &c_[result_tile_7 + ( ( ( tcol + 3 ) * cs_c ) )] ); + + z00 = svmla_m( svptrue_b32(), z0, zq0, zbeta ); + z10 = svmla_m( svptrue_b32(), z1, zq1, zbeta ); + z20 = svmla_m( svptrue_b32(), z2, zq2, zbeta ); + z30 = 
svmla_m( svptrue_b32(), z3, zq3, zbeta ); + z40 = svmla_m( svptrue_b32(), z4, zq4, zbeta ); + z50 = svmla_m( svptrue_b32(), z5, zq5, zbeta ); + z60 = svmla_m( svptrue_b32(), z6, zq6, zbeta ); + z70 = svmla_m( svptrue_b32(), z7, zq7, zbeta ); + + svst1_f64( svptrue_b32(), + &c_[result_tile_TL_corner + ( tcol + 3 ) * cs_c], z00 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_1 + ( tcol + 3 ) * cs_c], z10 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_2 + ( tcol + 3 ) * cs_c], z20 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_3 + ( tcol + 3 ) * cs_c], z30 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_4 + ( tcol + 3 ) * cs_c], z40 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_5 + ( tcol + 3 ) * cs_c], z50 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_6 + ( tcol + 3 ) * cs_c], z60 ); + svst1_f64( svptrue_b32(), + &c_[result_tile_7 + ( tcol + 3 ) * cs_c], z70 ); + } + } + } + + GEMM_UKR_FLUSH_CT( d ); + + return; +} + diff --git a/kernels/armsme/bli_kernels_armsme.h b/kernels/armsme/bli_kernels_armsme.h new file mode 100644 index 000000000..f9bd775ca --- /dev/null +++ b/kernels/armsme/bli_kernels_armsme.h @@ -0,0 +1,137 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +PACKM_KER_PROT( float, s, packm_armsme_int_2SVLx2SVL ) +PACKM_KER_PROT( double, d, packm_armsme_int_4SVLx2SVL ) + +__arm_new( "za" ) __arm_locally_streaming void bli_sgemm_armsme_int_2SVLx2SVL + ( + dim_t m, + dim_t n, + dim_t k, + const void* alpha, + const void* a, + const void* b, + const void* beta, + void* c, inc_t rs_c0, inc_t cs_c0, + const auxinfo_t* data, + const cntx_t* cntx + ) ; + +__arm_new( "za" ) __arm_locally_streaming void bli_dgemm_armsme_int_4SVLx2SVL + ( + dim_t m, + dim_t n, + dim_t k, + const void* alpha, + const void* a, + const void* b, + const void* beta, + void* c, inc_t rs_c, inc_t cs_c, + const auxinfo_t* data, + const cntx_t* cntx + ); + +PACKM_KER_PROT( float, s, packm_armsme_int_SVLx4SVL ) +PACKM_KER_PROT( double, d, packm_armsme_int_SVLx8SVL ) + +__arm_new( "za" ) __arm_locally_streaming void bli_sgemm_armsme_int_SVLx4SVL + ( + dim_t m, + dim_t n, + dim_t k, + const void* alpha, + const void* a, + const void* b, + const void* beta, + void* c, inc_t rs_c0, inc_t cs_c0, + const auxinfo_t* data, + const cntx_t* cntx + ) ; + +__arm_new( "za" ) __arm_locally_streaming void bli_dgemm_armsme_int_SVLx8SVL + ( + dim_t m, + dim_t n, + dim_t k, + const void* alpha, + 
const void* a, + const void* b, + const void* beta, + void* c, inc_t rs_c, inc_t cs_c, + const auxinfo_t* data, + const cntx_t* cntx + ); + +__arm_new("za") __arm_locally_streaming void bli_sgemm_armsme_int_4SVLxSVL + ( + dim_t m, + dim_t n, + dim_t k, + const void* alpha, + const void* a, + const void* b, + const void* beta, + void* c, inc_t rs_c0, inc_t cs_c0, + const auxinfo_t* data, + const cntx_t* cntx + ) ; + +__arm_new( "za" ) __arm_locally_streaming void bli_dgemm_armsme_int_8SVLxSVL + ( + dim_t m, + dim_t n, + dim_t k, + const void* alpha, + const void* a, + const void* b, + const void* beta, + void* c, inc_t rs_c, inc_t cs_c, + const auxinfo_t* data, + const cntx_t* cntx + ); + +__arm_new( "za" ) __arm_locally_streaming void bli_dgemm_armsme_int_2SVLx4SVL + ( + dim_t m, + dim_t n, + dim_t k, + const void* alpha, + const void* a, + const void* b, + const void* beta, + void* c, inc_t rs_c, inc_t cs_c, + const auxinfo_t* data, + const cntx_t* cntx + );