Not sure if there's a way around looping over at least one DataFrame
, but here's one option that might speed things up. It does allow for the accidental comparison of FirstName with LastName, though that can be avoided by adding a unique prefix to the values (like '@' for first name and '&' for last name)
import numpy as np
s1 = [set(x) for x in df1.values]
s2 = [set(x) for x in df2.values]
masks = np.reshape([len(x & y) >= 3 for x in s1 for y in s2], (len(df1), -1))
concat_all = [df2[m] for m in masks]
Output concat_all
[ FirstName LastName Birthday ResidenceZip
0 John Doe 1/1/2000 99999
1 John Doe 1/1/2000 99999
2 John Doex 1/1/2000 99999,
FirstName LastName Birthday ResidenceZip
5 Rob A 9/9/2009 19499]
Timings
def Alollz(df1, df2):
s1 = [set(x) for x in df1.values]
s2 = [set(x) for x in df2.values]
masks = np.reshape([len(x & y) >= 3 for x in s1 for y in s2], (len(df1), -1))
concat_all = [df2[m] for m in masks]
return concat_all
def SharpObject(df1, df2):
concat_all = []
for i, row in df1.iterrows():
c = {'ResidenceZip': row['ResidenceZip'], 'FirstName':row['FirstName'],
'LastName': row['LastName'],'Birthday': row['Birthday']}
df2['count'] = df2.apply(lambda x: partialMatch(x, c), axis = 1)
x1 = df2[df2['count']>=3]
concat_all.append(x1)
return concat_all
%timeit Alollz(df1, df2)
#785 μs ± 5.26 μs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%timeit SharpObject(df1, df2)
#3.56 ms ± 44.7 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
And larger:
# you should never append dfs like this in a loop
for i in range(7):
df1 = df1.append(df1)
df2 = df2.append(df2)
%timeit Alollz(df1, df2)
#132 ms ± 248 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit SharpObject(df1, df2)
#6.88 s ± 11.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…