Here's another way to perform :
(a-b)^2 = a^2 + b^2 - 2ab
with np.einsum
for the first two terms and dot-product
for the third one -
import numpy as np
np.einsum('ij,ij->i',A,A)[:,None] + np.einsum('ij,ij->i',B,B) - 2*np.dot(A,B.T)
Runtime test
Approaches -
def loopy_app(A,B):
m,n = A.shape[0], B.shape[0]
out = np.empty((m,n))
for i,a in enumerate(A):
out[i] = np.sum((a - B)**2,1)
return out
def broadcasting_app(A,B):
return ((A[:,np.newaxis,:] - B)**2).sum(-1)
# @Paul Panzer's soln
def outer_sum_dot_app(A,B):
return np.add.outer((A*A).sum(axis=-1), (B*B).sum(axis=-1)) - 2*np.dot(A,B.T)
# @Daniel Forsman's soln
def einsum_all_app(A,B):
return np.einsum('ijk,ijk->ij', A[:,None,:] - B[None,:,:],
A[:,None,:] - B[None,:,:])
# Proposed in this post
def outer_einsum_dot_app(A,B):
return np.einsum('ij,ij->i',A,A)[:,None] + np.einsum('ij,ij->i',B,B) -
2*np.dot(A,B.T)
Timings -
In [51]: A = np.random.randn(1000,100)
...: B = np.random.randn(1000,100)
...:
In [52]: %timeit loopy_app(A,B)
...: %timeit broadcasting_app(A,B)
...: %timeit outer_sum_dot_app(A,B)
...: %timeit einsum_all_app(A,B)
...: %timeit outer_einsum_dot_app(A,B)
...:
10 loops, best of 3: 136 ms per loop
1 loops, best of 3: 302 ms per loop
100 loops, best of 3: 8.51 ms per loop
1 loops, best of 3: 341 ms per loop
100 loops, best of 3: 8.38 ms per loop