The code is somewhat lengthy, but it does exactly what you expect.
import re
import pandas as pd
#from fuzzywuzzy import fuzz, process
class CustomMatcher:
    """Word-based fuzzy matcher.

    Runs of digits/dots are padded with spaces so numbers are compared as
    stand-alone words.  match() returns a length-weighted sum of per-word,
    position-wise character overlaps; identical strings score 100.
    """

    def add_space_before_numbers(self, text):
        """Insert a space before every run of digits/dots."""
        # \1 is the captured number group (the backslash was lost in the
        # original paste, which turned every number into a literal "1").
        return re.sub(r'([0-9.]+)', r' \1', text)

    def add_space_after_numbers(self, text):
        """Insert a space after every run of digits/dots."""
        return re.sub(r'([0-9.]+)([^0-9.])', r'\1 \2', text)

    def pad_spaces(self, text):
        """Isolate numeric groups as separate words, single-space separated."""
        result = self.add_space_before_numbers(text)
        result = self.add_space_after_numbers(result)
        # Padding can create several consecutive spaces; collapse them and
        # strip the edges so split() yields no empty words.
        return re.sub(r' {2,}', ' ', result).strip()

    def partial_word_score(self, word1, word2):
        """Percentage of positions (over the shorter word) where both words
        carry the same character; 0 when either word is empty."""
        shorter, longer = sorted((word1, word2), key=len)
        if not shorter:
            return 0  # avoid division by zero on empty words
        score = sum(1 for i, ch in enumerate(shorter) if longer[i] == ch)
        return (score * 100) / len(shorter)

    def match(self, comparand, target):
        """Score how well `comparand` matches `target` (roughly 0..100).

        Every target word is compared to every comparand word; each pair's
        partial score is weighted by the shorter word's share of the shorter
        input string, so a perfect match totals 100.
        """
        len_c = len(comparand)
        len_t = len(target)
        if min(len_c, len_t) == 0:
            return 0  # guard: empty input would divide by zero below
        comparand_words = self.pad_spaces(comparand.lower()).split()
        target_words = self.pad_spaces(target.lower()).split()
        complete_score = 0
        for t_word in target_words:
            for c_word in comparand_words:
                weight = min(len(t_word), len(c_word)) / min(len_c, len_t)
                complete_score += self.partial_word_score(t_word, c_word) * weight
        return complete_score
# Raw search terms: each entry maps one target column name to one term.
search_array = [
    {'column1': 'ryzen 5 5600'},
    {'column1': 'ram 8 gb ddr4 3.2ghz'},
    {'column2': 'SSD 220gb'},
    {'column3': 'windows 10 prof'},
    {'column2': 'ryzen 5 3600'},
    {'column1': 'ram 16 gb ddr4'}
]

# Group the terms by column: {'column1': ['ryzen 5 5600', ...], ...}.
search_dict = {}
for entry in search_array:
    # Each entry holds exactly one key/value pair.
    key, value = next(iter(entry.items()))
    search_dict.setdefault(key, []).append(value)
# ---- Read the products spreadsheet (single, header-less column). ----
filename = 'product1.xlsx'
products_sheet = pd.read_excel(filename, header=None, index_col=False)
products_list = [cells[0] for cells in products_sheet.values.tolist()]

# Result table: column #1 is the product description itself.
result_data = {filename.replace('.xlsx', ''): products_list}

# Columns #2..#n: one per search column, pre-filled with placeholder values.
columns = list(search_dict)
for column in columns:
    result_data[column] = list(range(len(products_list)))

# One matcher instance is enough: CustomMatcher holds no state, so there is
# no need to rebuild it for every product comparison.
matcher = CustomMatcher()
MIN_MATCH_SCORE = 50  # sensitivity threshold; raise it to be stricter

for row_no, row in enumerate(products_list):
    for column in columns:
        matched_products_list = []
        for product in search_dict[column]:
            matching_score = matcher.match(row, product)
            if matching_score > MIN_MATCH_SCORE:
                print(f'Comparing {row} to {product} is:', matching_score, ' accepted')
                matched_products_list.append(product)
            else:
                print(f'Comparing {row} to {product} is:', matching_score, ' rejected')
        # 'NAN' marks products with no accepted match in this column.
        result_data[column][row_no] = matched_products_list or 'NAN'

result_df = pd.DataFrame(data=result_data)
print(result_df)
Notes:
I created a `CustomMatcher` class instead of using fuzzywuzzy, whose scores were too erratic to set a meaningful filtering threshold on. `CustomMatcher` computes its score word by word, but compares each pair of words letter by letter. It isolates numbers as separate words by padding them with spaces before matching. All of those 50+ lines are driven through the single entry point `CustomMatcher.match(word1, word2)`.
I used `matching_score > 50` as a reasonable sensitivity threshold for matching in your application.
You do not need to define a join for the entries in a single cell; instead, I used lists, which make each individual entry easy to access.
The output is packed as a pandas data frame.
Thanks,
与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…