numpy  2.0.0
src/umath/simd.inc.src File Reference
#include "lowlevel_strided_loops.h"
#include "numpy/npy_common.h"
#include "numpy/ufuncobject.h"
#include "numpy/npy_math.h"
#include <assert.h>
#include <stdlib.h>
#include <float.h>
#include <string.h>

Defines

#define __NPY_SIMD_INC
#define IS_BLOCKABLE_UNARY(esize, vsize)
#define IS_BLOCKABLE_REDUCE(esize, vsize)
#define IS_BLOCKABLE_BINARY(esize, vsize)
#define IS_BLOCKABLE_BINARY_SCALAR1(esize, vsize)
#define IS_BLOCKABLE_BINARY_SCALAR2(esize, vsize)
#define IS_BLOCKABLE_BINARY_BOOL(esize, vsize)
#define IS_BLOCKABLE_BINARY_SCALAR1_BOOL(esize, vsize)
#define IS_BLOCKABLE_BINARY_SCALAR2_BOOL(esize, vsize)
#define LOOP_BLOCK_ALIGN_VAR(var, type, alignment)
#define LOOP_BLOCKED(type, vsize)
#define LOOP_BLOCKED_END   for (; i < n; i++)

Functions

static NPY_INLINE npy_intp abs_intp (npy_intp x)
static NPY_INLINE int run_ name _simd_ func _ TYPE (char **args, npy_intp *dimensions, npy_intp *steps)
static NPY_INLINE int run_binary_simd_ kind _BOOL (char **args, npy_intp *dimensions, npy_intp *steps)

Define Documentation

#define __NPY_SIMD_INC
This file is for the definitions of simd vectorized operations.
Currently it contains SSE2 functions that are built on amd64, x32 or non-generic builds (CFLAGS=-march=...). In the future it may contain other instruction sets, such as AVX or NEON, detected at runtime; in that case it needs to be included indirectly via a file compiled with special options (or use GCC target attributes) so that the binary stays portable.
#define IS_BLOCKABLE_BINARY (   esize,
  vsize 
)
Value:
(steps[0] == steps[1] && steps[1] == steps[2] && steps[2] == (esize) && \
     npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[1], (esize)) && \
     npy_is_aligned(args[0], (esize)) && \
     (abs_intp(args[2] - args[0]) >= (vsize) || \
      abs_intp(args[2] - args[0]) == 0) && \
     (abs_intp(args[2] - args[1]) >= (vsize) || \
      abs_intp(args[2] - args[1]) >= 0))

Referenced by TYPE().

#define IS_BLOCKABLE_BINARY_BOOL (   esize,
  vsize 
)
Value:
(steps[0] == (esize) && steps[0] == steps[1] && steps[2] == (1) && \
     npy_is_aligned(args[1], (esize)) && \
     npy_is_aligned(args[0], (esize)))
#define IS_BLOCKABLE_BINARY_SCALAR1 (   esize,
  vsize 
)
Value:
(steps[0] == 0 && steps[1] == steps[2] && steps[2] == (esize) && \
     npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[1], (esize)) && \
     ((abs_intp(args[2] - args[1]) >= (vsize)) || \
      (abs_intp(args[2] - args[1]) == 0)) && \
     abs_intp(args[2] - args[0]) >= (esize))

Referenced by TYPE().

#define IS_BLOCKABLE_BINARY_SCALAR1_BOOL (   esize,
  vsize 
)
Value:
(steps[0] == 0 && steps[1] == (esize) && steps[2] == (1) && \
     npy_is_aligned(args[1], (esize)))
#define IS_BLOCKABLE_BINARY_SCALAR2 (   esize,
  vsize 
)
Value:
(steps[1] == 0 && steps[0] == steps[2] && steps[2] == (esize) && \
     npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[0], (esize)) && \
     ((abs_intp(args[2] - args[0]) >= (vsize)) || \
      (abs_intp(args[2] - args[0]) == 0)) && \
     abs_intp(args[2] - args[1]) >= (esize))

Referenced by TYPE().

#define IS_BLOCKABLE_BINARY_SCALAR2_BOOL (   esize,
  vsize 
)
Value:
(steps[0] == (esize) && steps[1] == 0 && steps[2] == (1) && \
     npy_is_aligned(args[0], (esize)))
#define IS_BLOCKABLE_REDUCE (   esize,
  vsize 
)
Value:
(steps[1] == (esize) && abs_intp(args[1] - args[0]) >= (vsize) && \
     npy_is_aligned(args[1], (esize)) && \
     npy_is_aligned(args[0], (esize)))
#define IS_BLOCKABLE_UNARY (   esize,
  vsize 
)
Value:
(steps[0] == (esize) && steps[0] == steps[1] && \
     (npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \
     ((abs_intp(args[1] - args[0]) >= (vsize)) || \
      ((abs_intp(args[1] - args[0]) == 0))))
The stride is equal to the element size, and the input and destination are either equal or do not overlap within one register.
#define LOOP_BLOCK_ALIGN_VAR (   var,
  type,
  alignment 
)
Value:
npy_intp i, peel = npy_aligned_block_offset(var, sizeof(type),\
                                                alignment, n);\
    for(i = 0; i < peel; i++)
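Align var to alignment: declares the loop index i and the peel count (obtained from npy_aligned_block_offset), then opens a loop over the first peel elements.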

Referenced by _BOOL().

#define LOOP_BLOCKED (   type,
  vsize 
)
Value:
for(; i < npy_blocked_end(peel, sizeof(type), vsize, n);\
            i += (vsize / sizeof(type)))

Referenced by _BOOL().

#define LOOP_BLOCKED_END   for (; i < n; i++)

Referenced by _BOOL().


Function Documentation

static NPY_INLINE int run_unary_simd_ kind _BOOL ( char **  args,
npy_intp *  dimensions,
npy_intp *  steps 
) [static]
end repeat1*
end repeat*

* BOOL DISPATCHERS

begin repeat
# kind = logical_or, logical_and#
end repeat*
begin repeat
# kind = absolute, logical_not#

References c, LOOP_BLOCK_ALIGN_VAR, LOOP_BLOCKED, LOOP_BLOCKED_END, npy_is_aligned(), and OP.

Referenced by PyUFunc_On_Om().

static NPY_INLINE npy_intp abs_intp ( npy_intp  x) [static]
for NO_FLOATING_POINT_SUPPORT

for memcpy
Figure out the right abs function for pointer addresses.
static NPY_INLINE int run_ kind _simd_ TYPE ( char **  args,
npy_intp dimensions,
npy_intp steps 
) [static]
Dispatcher functions decide whether the operation can be vectorized and, if so, run it. They return true if the operation was run and false if nothing was done.

* FLOAT DISPATCHERS

begin repeat
Float types
#type = npy_float, npy_double, npy_longdouble# #TYPE = FLOAT, DOUBLE, LONGDOUBLE# #vector = 1, 1, 0#
begin repeat1
#func = sqrt, absolute, negative, minimum, maximum# #check = IS_BLOCKABLE_UNARY*3, IS_BLOCKABLE_REDUCE*2# #name = unary*3, unary_reduce*2# #minmax = 0*3, 1*2#
end repeat1*
begin repeat1
Arithmetic # kind = add, subtract, multiply, divide#
end repeat1*
begin repeat1
#kind = equal, not_equal, less, less_equal, greater, greater_equal, logical_and, logical_or#

#simd = 1, 1, 1, 1, 1, 1, 0, 0#

end repeat1*
begin repeat1
#kind = isnan, isfinite, isinf, signbit#

References dimensions, IS_BLOCKABLE_BINARY, IS_BLOCKABLE_BINARY_SCALAR1, IS_BLOCKABLE_BINARY_SCALAR2, and kind().