Use at least Numba, or a Fortran implementation
Both of your functions are quite slow. Pure-Python loops are very inefficient (A1), and allocating large temporary arrays is also costly (A2, and partially A1 as well).
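For reference, the two variants discussed here presumably look roughly like the following (an assumption on my part; your actual A1/A2 may differ in detail). Both compute z[i] = sum_j (x[i] - y[j])**2:

import numpy as np

def A1(x, y):
    # hypothetical reconstruction: pure-Python double loop, interpreter overhead dominates
    z = np.empty(x.shape[0])
    for i in range(x.shape[0]):
        TMP = 0.
        for j in range(y.shape[0]):
            TMP += (x[i] - y[j])**2
        z[i] = TMP
    return z

def A2(x, y):
    # hypothetical reconstruction: broadcasting allocates a full (len(x), len(y)) temporary
    return np.sum((x[:, None] - y[None, :])**2, axis=1)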
Naive Numba implementation for small arrays
import numba as nb
import numpy as np

@nb.njit(parallel=True, fastmath=True)
def A_nb_p(x, y):
    z = np.empty(x.shape[0])
    # parallelize over x; each thread accumulates its own sum over y
    for i in nb.prange(x.shape[0]):
        TMP = 0.
        for j in range(y.shape[0]):
            TMP += (x[i] - y[j])**2
        z[i] = TMP
    return z
Timings
import time
N=int(1e5)
a=np.random.rand(N)
b=10*(np.random.rand(N)-0.5)
t1=time.time()
res_1=A1(a,b)
print(time.time()-t1)
#95.30195426940918 s
t1=time.time()
res_2=A_nb_p(a,b)
print(time.time()-t1)
#0.28573083877563477 s
#A2 is too slow to measure
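Note that the first call to an @nb.njit function also includes JIT compilation. If you want to measure only the execution time, warm the function up once before timing, e.g.:

A_nb_p(a[:10], b[:10])  # triggers compilation; later calls with the same types reuse it

and then run the timing above.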
As noted above, this is a naive implementation that performs poorly on larger arrays: Numba does not evaluate the calculation block-wise, which leads to a lot of cache misses and therefore poor performance. A good Fortran or C compiler should be able to apply at least the following optimization (block-wise evaluation) automatically.
Implementation for large arrays
@nb.njit(parallel=True, fastmath=True)
def A_nb_p_2(x, y):
    blk_s = 1024
    z = np.zeros(x.shape[0])
    num_blk_x = x.shape[0] // blk_s
    num_blk_y = y.shape[0] // blk_s

    # main part: process full blk_s x blk_s blocks so the y-block stays in cache
    for ii in nb.prange(num_blk_x):
        for jj in range(num_blk_y):
            for i in range(blk_s):
                TMP = z[ii*blk_s+i]
                for j in range(blk_s):
                    TMP += (x[ii*blk_s+i] - y[jj*blk_s+j])**2
                z[ii*blk_s+i] = TMP

    # remainder of y (last partial block) for every x
    for i in nb.prange(x.shape[0]):
        TMP = z[i]
        for j in range(num_blk_y*blk_s, y.shape[0]):
            TMP += (x[i] - y[j])**2
        z[i] = TMP

    # remainder of x (last partial block) over the blocked part of y
    for i in nb.prange(num_blk_x*blk_s, x.shape[0]):
        TMP = z[i]
        for j in range(num_blk_y*blk_s):
            TMP += (x[i] - y[j])**2
        z[i] = TMP

    return z
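As a quick sanity check, both Numba versions should agree up to floating-point rounding (fastmath=True and the blocked summation order change the rounding slightly); choosing sizes that are not multiples of blk_s also exercises the remainder loops:

a_s = np.random.rand(5000)
b_s = 10*(np.random.rand(3000)-0.5)
assert np.allclose(A_nb_p(a_s, b_s), A_nb_p_2(a_s, b_s))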
Timings
N=int(2*1e6)
a=np.random.rand(N)
b=10*(np.random.rand(N)-0.5)
t1=time.time()
res_1=A_nb_p(a,b)
print(time.time()-t1)
#298.9394392967224 s
t1=time.time()
res_2=A_nb_p_2(a,b)
print(time.time()-t1)
#70.12 s