本文整理汇总了Python中sklearn.neighbors.LSHForest类的典型用法代码示例。如果您正苦于以下问题：Python LSHForest类的具体用法？Python LSHForest怎么用？Python LSHForest使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了LSHForest类的20个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: test_fit
def test_fit():
    """Verify that ``fit`` populates every fitted attribute with the expected size."""
    n_samples, n_features, n_estimators = 12, 2, 5
    data = np.random.RandomState(42).rand(n_samples, n_features)

    forest = LSHForest(n_estimators=n_estimators)
    forest.fit(data)

    # The training data is stored unchanged on the estimator.
    assert_array_equal(data, forest._fit_X)

    # One hash function g(p) per tree, each with a hash length of 32.
    hashers = forest.hash_functions_
    assert_equal(n_estimators, len(hashers))
    assert_equal(32, hashers[0].components_.shape[0])

    # One tree per estimator, each holding an entry for every data point.
    trees = forest.trees_
    assert_equal(n_estimators, len(trees))
    assert_equal(n_samples, len(trees[0]))

    # One array of original indices (after sorting the hashes) per tree,
    # again with one entry per data point.
    index_sets = forest.original_indices_
    assert_equal(n_estimators, len(index_sets))
    assert_equal(n_samples, len(index_sets[0]))
开发者ID:cnspica,项目名称:scikit-learn,代码行数:25,代码来源:test_approximate.py
示例2: single_batch
def single_batch(self, tweets):
    """Flag tweets in a batch that have several near-duplicates in the same batch.

    The batch is vectorised with a bag-of-words model, indexed with an
    ``LSHForest`` and every sufficiently long tweet is queried against that
    index with radius ``self.sensitivity``.

    Parameters
    ----------
    tweets : list of str
        Text of the tweets in this batch (text only).

    Returns
    -------
    list of int
        Positions (within ``tweets``) of the tweets for which the radius
        query returned more than two neighbours, i.e. the suspected spam
        tweets. An empty batch yields an empty list.
    """
    working_batch_size = len(tweets)
    if working_batch_size == 0:
        # Nothing to analyse; CountVectorizer would raise on an empty corpus.
        return []

    # Vectorize and fit tree:
    vectorizer = CountVectorizer(stop_words=self.custom_stop_words)
    counts = vectorizer.fit_transform(tweets)
    forest = LSHForest()
    forest.fit(counts)

    # Query every tweet against the index. The rows of ``counts`` are
    # iterated directly: transforming ``tweets`` a second time would only
    # reproduce the same matrix.
    flagged = []
    for position, row in enumerate(counts):
        if position % 100 == 0:
            print("%r tweets analyzed out of %r for this batch" % (position, working_batch_size))
        # Only deal with tweets that contain more than 2 distinct vocabulary
        # terms; shorter ones are never flagged, so the (comparatively
        # expensive) radius query is skipped for them.
        if row.getnnz() <= 2:
            continue
        neighbors = forest.radius_neighbors(row, radius=self.sensitivity)[1]
        if len(neighbors[0]) > 2:
            flagged.append(position)
    return flagged
开发者ID:ilyaaltshteyn,项目名称:danger_tweets,代码行数:32,代码来源:tweetPreprocessor.py
示例3: __init__
class EmbeddingNetworkBuilder:
    """Thin wrapper around scikit-learn's ``LSHForest`` for nearest-neighbour graphs."""

    def __init__(self, lsh_init=None):
        """Create the builder.

        lsh_init -- an already constructed forest to use; when ``None`` a
                    default ``LSHForest(n_estimators=25, n_candidates=1000)``
                    is created.
        """
        if lsh_init is None:
            self._lsh_forest = LSHForest(n_estimators=25, n_candidates=1000)
        else:
            self._lsh_forest = lsh_init
        # Set by fit_lsh_forest(); initialised here so the attribute always exists.
        self._embedding = None
        self.iw = None
        self.m = None

    def fit_lsh_forest(self, embedding):
        """Fit the forest on ``embedding.m`` and remember the embedding."""
        self._lsh_forest.fit(embedding.m)
        self._embedding = embedding

    def extract_nn_network(self, nn=20):
        """Return the k-neighbours graph of the fitted embedding.

        ``nn + 1`` neighbours are requested per row (presumably because each
        point is returned as its own nearest neighbour -- confirm).
        """
        dir_graph_mat = self._lsh_forest.kneighbors_graph(X=self._embedding.m, n_neighbors=nn+1)
        return dir_graph_mat

    def make_undirected(self, dir_graph_mat):
        """Convert a sparse adjacency matrix into a node set and an edge set.

        Returns ``(nodes, edges)`` where ``nodes`` is ``{0, ..., n_rows - 1}``
        and ``edges`` holds one ``(row, column)`` pair per non-zero entry.
        Note that pairs are stored exactly as found in the matrix; the reverse
        pair is not added.
        """
        n_nodes = dir_graph_mat.shape[0]
        nodes = set(range(n_nodes))
        edges = set()
        # shape[0] is an int, so it has to be wrapped in range() to iterate rows.
        for node_i in range(n_nodes):
            for node_j in dir_graph_mat[node_i].nonzero()[1]:
                edges.add((node_i, int(node_j)))
        return nodes, edges

    def get_forest(self):
        """Return the wrapped forest."""
        return self._lsh_forest

    def get_node_to_word(self):
        """Return ``self.iw`` (``None`` unless assigned from outside this class)."""
        return self.iw
开发者ID:viveksck,项目名称:langchange,代码行数:32,代码来源:networkinducer.py
示例4: single_batch
def single_batch(self, tweets):
    """Flag tweets in a batch that have several near-duplicates in the same batch.

    The batch is vectorised with a bag-of-words model, indexed with an
    ``LSHForest`` and every tweet is queried against that index with a fixed
    radius of 0.4.

    Parameters
    ----------
    tweets : iterable of str
        Text of the tweets in this batch (text only).

    Returns
    -------
    list of int
        Positions (within the batch) of the tweets for which the radius query
        returned more than two neighbours, i.e. the suspected spam tweets.
        An empty batch yields an empty list.
    """
    tweets = list(tweets)
    if not tweets:
        # Nothing to analyse; CountVectorizer would raise on an empty corpus.
        return []

    # Vectorize and fit tree:
    vectorizer = CountVectorizer(stop_words=self.common_twitter_handles)
    counts = vectorizer.fit_transform(tweets)
    forest = LSHForest()
    forest.fit(counts)

    # Query every tweet against the index. The rows of ``counts`` are
    # iterated directly: transforming ``tweets`` a second time would only
    # reproduce the same matrix.
    flagged = []
    for position, row in enumerate(counts):
        if position % 100 == 0:
            print("%r tweets analyzed out of %r for this batch" % (position, self.batch_size))
        neighbors = forest.radius_neighbors(row, radius=.4)[1]
        if len(neighbors[0]) > 2:
            flagged.append(position)
    return flagged
开发者ID:ilyaaltshteyn,项目名称:tweet_pre_processor,代码行数:26,代码来源:tweet_processor_bigdist.py
示例5: get_nearest_neighbor_iterable
def get_nearest_neighbor_iterable(self, graphlist, start_graphs, start_is_subset=True):
    """Pair every start graph with an approximate nearest neighbour from ``graphlist``.

    Both collections are vectorised with ``self.vectorizer.transform_single``
    (on deep copies), an ``LSHForest`` is fitted on the ``graphlist`` vectors
    and queried with the ``start_graphs`` vectors for two neighbours each.

    When ``start_is_subset`` is true the second neighbour is used instead of
    the first (presumably because the first hit is then the start graph
    itself -- confirm).

    Yields ``(start_graph, matched_graph, matched_graph_vector)`` tuples,
    ordered by the index of the matched graph in ``graphlist``.
    """
    # Vectorize copies so the caller's graph objects are left alone.
    graphlist = list(graphlist)
    X = self.vectorizer.transform_single(copy.deepcopy(graphlist))
    start_graphs = list(start_graphs)
    Y = self.vectorizer.transform_single(copy.deepcopy(start_graphs))

    # See http://scikit-learn.org/stable/modules/neighbors.html
    forest = LSHForest()
    forest.fit(X)
    distances, indices = forest.kneighbors(Y, n_neighbors=2)

    # Column of the neighbour to report: 1 skips the closest hit.
    column = 1 if start_is_subset else 0

    # Each match is (index into X, index into Y, distance); sorting orders
    # them by the matched graph's index first.
    matches = sorted(
        (indices[row, column], row, distances[row, column])
        for row in range(len(indices))
    )
    for x_idx, y_idx, _ in matches:
        yield (start_graphs[y_idx], graphlist[x_idx], X[x_idx])
开发者ID:antworteffekt,项目名称:GraphLearn,代码行数:33,代码来源:directedsampler.py
示例6: test_hash_functions
def test_hash_functions():
    """Check that the hash functions of a forest are built randomly.

    The variance and mean of each individual hash function (projection
    vector) must differ from those of all hash functions taken together.
    If every hash function were seeded with the same value, the per-function
    statistics would all be equal to the pooled ones.
    """
    n_samples, n_features, n_estimators = 12, 2, 5
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)
    seed = rng.randint(0, np.iinfo(np.int32).max)

    forest = LSHForest(n_estimators=n_estimators, random_state=seed)
    forest.fit(X)

    # Projection matrices of all trees, pooled for the reference statistics.
    pooled = [forest.hash_functions_[k].components_ for k in range(n_estimators)]

    for k in range(n_estimators):
        single = forest.hash_functions_[k].components_
        assert_not_equal(np.var(pooled), np.var(single))

    for k in range(n_estimators):
        single = forest.hash_functions_[k].components_
        assert_not_equal(np.mean(pooled), np.mean(single))
开发者ID:cnspica,项目名称:scikit-learn,代码行数:29,代码来源:test_approximate.py
示例7: search_neighbors
def search_neighbors(request):
    """Return the stored designs whose images are closest to a query image.

    Query parameters read from ``request.GET``:
        num   -- number of neighbours to return.
        input -- file name of the query image inside ``settings.DESIGN_PATH``.

    Responds with JSON of the form ``{"results": [...]}`` where each entry
    carries the ``image``, ``text``, ``like`` and ``filtered`` values of one
    neighbouring design.
    """
    import os

    # Materialise the queryset once: the same list is used both to build the
    # index and to look results up, so row ``i`` of the index is guaranteed to
    # be ``designs[i]`` and no extra query is issued per lookup.
    designs = list(Design.objects.all())

    d_geometry = settings.D_GEOMETRY
    n_values = d_geometry[0] * d_geometry[1] * 3

    designed_images = np.empty((len(designs), n_values), dtype="float32")
    for row, design in enumerate(designs):
        image_path = settings.DESIGN_PATH + str(design.uid) + ".png"
        designed_images[row] = img2numpy_arr(image_path).reshape(n_values)
    designed_images /= 255

    # NOTE(review): the index is rebuilt from all images on every request;
    # consider caching it outside this view if that becomes a bottleneck.
    lshf = LSHForest(random_state=42)
    lshf.fit(designed_images)

    num = int(request.GET['num'])
    # The file name comes from the client: keep only its final component so
    # it cannot point outside DESIGN_PATH.
    input_fname = os.path.basename(str(request.GET['input']))
    input_image = img2numpy_arr(settings.DESIGN_PATH + input_fname)
    # Cast before scaling so the query is normalised exactly like the indexed
    # images (an integer array would be floor-divided on Python 2).
    input_image = input_image.reshape(1, -1).astype("float32") / 255

    _, indices = lshf.kneighbors(input_image, n_neighbors=num)

    similar_images = []
    for i in indices.reshape(-1):
        design = designs[int(i)]
        similar_images.append({
            "image": str(design.uid) + ".png",
            "text": str(design.history_text),
            "like": int(design.like),
            "filtered": str(design.filtered)
        })
    return JsonResponse({
        "results": similar_images
    })
开发者ID:Soma2-HighFashion,项目名称:Design_Studio,代码行数:34,代码来源:views.py
示例8: build_index
def build_index(data, n_estimators=20, n_candidates=100, n_neighbors=10, seed=0):
    """Fit an ``LSHForest`` on ``data`` and measure how long fitting takes.

    The keyword arguments are forwarded to the ``LSHForest`` constructor
    (``seed`` as ``random_state``).

    Returns ``(fitted_forest, seconds_spent_in_fit)``.
    """
    forest = LSHForest(
        n_estimators=n_estimators,
        n_candidates=n_candidates,
        n_neighbors=n_neighbors,
        random_state=seed,
    )
    # Only the call to fit() is timed, not the construction above.
    started = time()
    forest.fit(data)
    elapsed = time() - started
    return forest, elapsed
开发者ID:ogrisel,项目名称:lsh_glove,代码行数:7,代码来源:index_embedding.py
示例9: get_heap_and_forest
def get_heap_and_forest(self, griter, k):
    '''
    Build the start heap and the nearest-neighbour forest for the given graphs.

    griter -- iterable of graphs.
    k      -- counter base; every heap entry gets the count ``k + 1``.

    Returns ``(heap, forest, avg_dist)``:
      heap     -- heapq list of ``(score, count, graph)`` tuples, where score
                  is ``predict_proba(...)[0][1]`` of the sampler's estimator
                  for that graph (~ distance from the hyperplane).
      forest   -- an LSHForest fitted on the vectorised graphs.
      avg_dist -- the middle entry of the (unsorted) list of distances to the
                  second-nearest neighbour, used as a typical distance.
    '''
    graphs = list(griter)
    # transform does mess up the graph objects, so keep untouched copies
    graphs2 = copy.deepcopy(graphs)
    X = self.vectorizer.transform(graphs)

    forest = LSHForest()
    forest.fit(X)
    print('got forest')

    heap = []
    # Only the graph copies are needed here; the rows of X are not used.
    for graph in graphs2:
        graph2 = nx.Graph(graph)
        # score ~ dist from hyperplane
        score = self.sampler.estimator.predict_proba(
            self.sampler.vectorizer.transform_single(graph2))[0][1]
        # k + 1 keeps the counter high so the start graphs are not output at
        # the end; the last element is the actual graph.
        heapq.heappush(heap, (score, k + 1, graph))
    print('got heap')

    distances, unused = forest.kneighbors(X, n_neighbors=2)
    # the second element should be the dist we want
    distances = [row[1] for row in distances]
    # Floor division keeps the index an int under true division as well.
    avg_dist = distances[len(distances) // 2]
    print('got dist')
    return heap, forest, avg_dist
开发者ID:smautner,项目名称:GraphLearn,代码行数:32,代码来源:discsampler.py
示例10: test_neighbors_accuracy_with_n_estimators
def test_neighbors_accuracy_with_n_estimators():
    """Check that accuracy does not drop as ``n_estimators`` grows.

    For each forest size, random training points are used as queries and the
    approximate neighbours are compared with the exact cosine ranking; the
    mean overlap ratio is the accuracy for that size.
    """
    estimator_counts = np.array([1, 10, 100])
    n_samples, n_features = 100, 10
    n_iter, n_points = 10, 5
    rng = np.random.RandomState(42)
    accuracies = np.zeros(estimator_counts.shape[0], dtype=float)
    X = rng.rand(n_samples, n_features)

    for slot, n_trees in enumerate(estimator_counts):
        forest = LSHForest(n_candidates=500, n_estimators=n_trees)
        ignore_warnings(forest.fit)(X)

        for _ in range(n_iter):
            query = X[rng.randint(0, n_samples)].reshape(1, -1)
            approx = forest.kneighbors(query, n_neighbors=n_points,
                                       return_distance=False)
            # Exact reference: the n_points closest samples by cosine distance.
            exact_dists = pairwise_distances(query, X, metric='cosine')
            exact = np.argsort(exact_dists)[0, :n_points]
            overlap = np.intersect1d(exact, approx).shape[0]
            accuracies[slot] += overlap / float(n_points)

        accuracies[slot] /= float(n_iter)

    # Accuracies must already be in non-decreasing order.
    assert_true(np.all(np.diff(accuracies) >= 0),
                msg="Accuracies are not non-decreasing.")
    # The best accuracy must be strictly greater than the worst one.
    assert_true(np.ptp(accuracies) > 0,
                msg="Highest accuracy is not strictly greater than lowest.")
开发者ID:AlexandreAbraham,项目名称:scikit-learn,代码行数:32,代码来源:test_approximate.py
示例11: text_hist
def text_hist():
    """
    Build combined text + SIFT histograms for the images and index them.

    Reads the image names and SIFT histograms from ``data/sift_names.pkl`` and
    ``data/sift_hist.pkl``, vectorises the matching description files under
    ``shopping/images/``, mixes both histograms with weight ``lamb`` and fits
    an ``LSHForest`` on the result.

    Writes ``data/text_hist.pkl`` (normalised text histogram),
    ``data/vectorizer.pkl`` (fitted vectorizer) and
    ``data/lshforest_combine.pkl`` (fitted forest). Returns nothing.
    """
    # Pickle files are opened in binary mode so they round-trip on every
    # platform and pickle protocol.
    with open('data/sift_names.pkl', 'rb') as f:
        names = cPickle.load(f)
    with open('data/sift_hist.pkl', 'rb') as f:
        sift_hists = cPickle.load(f)

    # Map every image name to the path of its description text file.
    filenames = [
        'shopping/images/' + name.replace('img', 'descr').replace('.jpg', '.txt')
        for name in names
    ]

    vectorizer = CountVectorizer(input='filename', token_pattern=r"(?u)\w+", ngram_range=(1, 1), min_df=2)
    xall_transformed = vectorizer.fit_transform(filenames).tocsr()
    # Use the returned matrix: the counts are integers, so normalize() has to
    # convert them to floats and cannot update the input in place even with
    # copy=False.
    xall_transformed = preprocessing.normalize(xall_transformed, copy=False)

    # Weight of the text histogram; the SIFT histogram gets 1 - lamb.
    lamb = .5
    hists = scipy.sparse.hstack([xall_transformed * lamb, sift_hists * (1-lamb)]).toarray()
    hists = preprocessing.normalize(hists, copy=False)

    model = LSHForest()
    model.fit(hists)

    with open('data/text_hist.pkl', 'wb') as f:
        cPickle.dump(xall_transformed, f)
    with open('data/vectorizer.pkl', 'wb') as f:
        cPickle.dump(vectorizer, f)
    with open('data/lshforest_combine.pkl', 'wb') as f:
        cPickle.dump(model, f)
开发者ID:bangnk,项目名称:tu_anh,代码行数:28,代码来源:tiny.py
示例12: test_distances
def test_distances():
    """Checks whether returned neighbors are from closest to farthest."""
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest()
    lshf.fit(X)

    for _ in range(n_iter):
        n_neighbors = rng.randint(0, n_samples)
        # Queries are passed as a single-row 2-D array, as in
        # test_neighbors_accuracy_with_n_estimators.
        query = X[rng.randint(0, n_samples)].reshape(1, -1)

        distances, neighbors = lshf.kneighbors(query,
                                               n_neighbors=n_neighbors,
                                               return_distance=True)
        # Returned neighbors should be from closest to farthest.
        assert_true(np.all(np.diff(distances[0]) >= 0))

        # The same ordering must hold for a radius query; the mean cosine
        # distance to all samples is used as the radius.
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_true(np.all(np.diff(distances[0]) >= 0))
开发者ID:CC-Fu-CC,项目名称:scikit-learn,代码行数:25,代码来源:test_approximate.py
示例13: create_tree
def create_tree(self,listNames,variableName):
    """Fit an LSHForest on the names and pickle the forest, names and TF data.

    listNames    -- the names to index; tokenised with ``self.tokenize`` and
                    vectorised with ``self.create_TDIDF``.
    variableName -- prefix of the dump files written to
                    ``self.folderSaveData``.

    Writes ``<variableName>_lshf.dump``, ``<variableName>_listNames.dump`` and
    ``<variableName>_TF.dump``. Returns nothing.
    """
    # LSHForest. only once for the main database
    lshf = LSHForest(n_estimators=50,n_candidates=500)
    TF, tfidfs = self.create_TDIDF(self.tokenize(listNames))
    lshf.fit(tfidfs)

    artifacts = (("lshf", lshf), ("listNames", listNames), ("TF", TF))
    for suffix, artifact in artifacts:
        path = "{0}/{1}_{2}.dump".format(self.folderSaveData, variableName, suffix)
        # The context manager closes (and flushes) each file once it is written.
        with open(path, "wb+") as handle:
            pickle.dump(artifact, handle)
开发者ID:uvacorpnet,项目名称:name_matching,代码行数:8,代码来源:matcher.py
示例14: test_radius_neighbors_boundary_handling
def test_radius_neighbors_boundary_handling():
    """Check that radius queries treat points lying exactly on the radius alike.

    An exact brute-force model and an LSHForest configured to be exact on
    this toy dataset must agree on which points a radius query returns, both
    when a point sits on the boundary and when the radius is shrunk slightly.
    """
    X = [[0.999, 0.001], [0.5, 0.5], [0, 1.], [-1., 0.001]]
    n_points = len(X)

    # Exact reference model, to check consistency between the exact and the
    # approximate method.
    exact_model = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)

    # Hyperparameters chosen so that the forest always returns exact results
    # on this toy dataset.
    approx_model = LSHForest(min_hash_match=0, n_candidates=n_points,
                             random_state=42).fit(X)

    # Query aligned with the first axis.
    query = [[1., 0.]]

    # Exact cosine distances from the query to the four points.
    dists = pairwise_distances(query, X, metric='cosine').ravel()

    # Point 0 is almost aligned with the query: distance close to zero.
    assert_almost_equal(dists[0], 0, decimal=5)
    # Point 1 forms a 45 degree angle with the query.
    assert_almost_equal(dists[1], 1 - np.cos(np.pi / 4))
    # Point 2 is orthogonal to the query: distance exactly one.
    assert_almost_equal(dists[2], 1)
    # Point 3 is almost colinear but points the opposite way: the cosine
    # 'distance' is very close to its maximum value of 2.
    assert_almost_equal(dists[3], 2, decimal=5)

    eps = np.finfo(np.float64).eps
    scenarios = (
        # Radius one: every sample but the last is included, with the third
        # sample lying exactly on the boundary.
        (1, [0, 1, 2], dists[:-1]),
        # Slightly smaller radius: the boundary point is now rejected.
        (1 - eps, [0, 1], dists[:-2]),
    )
    for radius, expected_idx, expected_dists in scenarios:
        exact_dists, exact_idx = exact_model.radius_neighbors(query, radius=radius)
        approx_dists, approx_idx = approx_model.radius_neighbors(query, radius=radius)

        assert_array_equal(np.sort(exact_idx[0]), expected_idx)
        assert_array_equal(np.sort(approx_idx[0]), expected_idx)
        assert_array_almost_equal(np.sort(exact_dists[0]), expected_dists)
        assert_array_almost_equal(np.sort(approx_dists[0]), expected_dists)
开发者ID:AlexandreAbraham,项目名称:scikit-learn,代码行数:58,代码来源:test_approximate.py
示例15: __init__
class LHSForestEngine:
    """Nearest-neighbour distance engine backed by an ``LSHForest``."""

    def __init__(self):
        # Display name of this engine, followed by the wrapped estimator.
        self.engine = LSHForest(random_state=42)
        self.name = "LHS"

    def fit(self, data):
        """Fit the underlying forest on ``data``."""
        self.engine.fit(data)

    def dist(self, data):
        """Return the flattened distances from each row of ``data`` to its single nearest neighbour."""
        nearest = self.engine.kneighbors(data, n_neighbors=1)
        return nearest[0].ravel()
开发者ID:enoonIT,项目名称:nbnn-nbnl,代码行数:12,代码来源:nbnn.py
示例16: calculate_duplication_number
def calculate_duplication_number(self,text_list):
    """Print nearest-neighbour distances between texts based on word 5-grams.

    The texts are turned into counts of word 5-grams, an ``LSHForest`` is
    fitted on them and each text is queried for its single nearest neighbour.
    The number of texts, the first count row and the resulting distances and
    indices are printed; nothing is returned.
    """
    print(" ".join(["length is ", str(len(text_list))]))

    # Count word-level 5-grams, without removing stop words.
    shingle_counter = CountVectorizer(stop_words=None,analyzer='word',ngram_range=(5,5))
    counts = shingle_counter.fit_transform(text_list)
    print(counts[0])

    forest = LSHForest()
    forest.fit(counts)
    distance, index = forest.kneighbors(counts, n_neighbors=1)
    print(" ".join([str(distance), str(index)]))
开发者ID:rivercold,项目名称:webStructure,代码行数:13,代码来源:irobot_crawl.py
示例17: startQuery
def startQuery():
    """Interactive loop that prints the nearest documents for a query document.

    Repeatedly prompts for the path of a query document, tokenises it with
    ``get_tokens_by_dir``, removes stop words, stems the tokens, infers the
    topic distribution with ``LDA`` and prints the indices of the 20 nearest
    rows of ``DOC_TOPICS``. Typing ``exit()`` (or closing the input stream)
    ends the loop. A path that cannot be read prints a message and prompts
    again.
    """
    # Neither the stemmer nor the index depends on the query, so they are
    # created once. The index is built lazily so that leaving immediately
    # does not pay for fitting it.
    p_stemmer = PorterStemmer()
    lshf = None

    while True:
        try:
            query = raw_input('Directory of query:')
        except EOFError:
            # Input stream closed: there is nothing more to read.
            break
        if query == 'exit()':
            break

        print('loading query...')
        try:
            token = get_tokens_by_dir(query)
        except IOError:
            print('invalid file name')
            continue

        # ---- query preprocessing ----
        print('query pre-processing...')
        stopped_tokens = [i for i in token if i not in en_stop]
        stemed_tokens = []
        for i in stopped_tokens:
            try:
                stemed_tokens.append(str(p_stemmer.stem(i)))
            except IndexError:
                # Best effort: tokens the stemmer fails on are skipped.
                pass
        tokens = [stemed_tokens]

        # ---- topic inference ----
        dictionary_new = corpora.Dictionary(tokens)
        corpus_new = [dictionary_new.doc2bow(text) for text in tokens]
        QUERY_TOPIC = np.zeros([1, num_topic])  # topic vector for query
        new_topics = LDA[corpus_new]
        for i in new_topics[0]:
            print(i)
            # i is (topic id, weight): assign new topics to the query
            # doc-topic matrix
            QUERY_TOPIC[0, i[0]] = i[1]

        print('fetching results for you...')
        if lshf is None:
            # fit the locality-sensitive hash forest with the training data
            lshf = LSHForest(random_state=42)
            lshf.fit(DOC_TOPICS)
        dist, indices = lshf.kneighbors(QUERY_TOPIC, n_neighbors=20)
        print(indices)
开发者ID:wylswz,项目名称:FYPLinux,代码行数:51,代码来源:lda_read_model.py
示例18: fit_model
def fit_model(self, model_type='brute', params=None):
    '''
    Build the requested nearest-neighbour model and fit it on ``self.vector_space``.

    model_type -- 'brute' for an exact ``NearestNeighbors`` model, 'lsh' for an
                  ``LSHForest``. Any other value keeps the model already stored
                  in ``self.model`` and simply refits it (this operates under
                  the assumption that there's a model already built).
    params     -- optional dict of keyword arguments for the model
                  constructor; ``None`` means no extra arguments.
    '''
    # ``**None`` is a TypeError, so fall back to an empty mapping.
    if params is None:
        params = {}

    if model_type == 'brute':
        self.model = NearestNeighbors(algorithm='brute', **params)
    elif model_type == 'lsh':
        self.model = LSHForest(**params)
    # elif model_type == 'annoy':
    #     self.model = Annoy(**params)

    self.model.fit(self.vector_space)
    print(self.model)
开发者ID:asharma567,项目名称:chat_bot_w_RNN,代码行数:14,代码来源:deduper_class.py
示例19: fit_lshf
def fit_lshf(data):
    """Fit an ``LSHForest`` with explicitly spelled-out settings on ``data`` and return it."""
    logger.info('Fitting LSHForest...')
    from sklearn.neighbors import LSHForest

    # Every constructor argument is listed so the configuration is visible
    # in one place.
    forest_settings = {
        'n_estimators': 20,
        'min_hash_match': 4,
        'n_candidates': 200,
        'n_neighbors': 2,
        'radius': 1.0,
        'radius_cutoff_ratio': 0.9,
        'random_state': None,
    }
    forest = LSHForest(**forest_settings)
    forest.fit(data)
    return forest
开发者ID:Curly-Mo,项目名称:sample-recognition,代码行数:14,代码来源:ann.py
示例20: __init__
def __init__(self):
    """Set up storage folders, the LSH forest and the bookkeeping state."""
    self.unknown = ''
    self.same_person_num = 1
    self.has_cal_dist = []
    self.NeighbourNum = 10

    # When an administrator loads pictures, they go into the given person's
    # directory under all_pic_data_folder (a picture file and its feature
    # file share the same file name).
    # The research-institute model stores the features directly.
    self.all_pic_feature_data_folder = '/data/liubo/face/research_feature_self'
    # Keeping the pictures makes it easy to review results later, to show
    # them in the front end and for the administrator to annotate them.
    self.all_pic_data_folder = '/data/liubo/face/research_self'
    for folder in (self.all_pic_data_folder, self.all_pic_feature_data_folder):
        if not os.path.exists(folder):
            os.makedirs(folder)

    self.n_neighbors = 10
    self.lshf = LSHForest(n_estimators=20, n_candidates=200, n_neighbors=self.n_neighbors)
    self.all_labels = []
    self.all_pic_feature = []

    # Identifiers of the possible verification outcomes.
    self.same_pic_id = 2
    self.must_be_same_id = 1
    self.must_be_not_same_id = 0
    self.maybe_same_id = 3

    self.new_person_str = 'new_person_'
    self.current_new_person_id = self.find_current_new_person_id()
    self.must_same_str = '_Must_Same'
    self.maybe_same_str = '_Maybe_same'
    self.load_time = time.time()
    self.user_count = {}

    # Thresholds are taken from names defined outside this method.
    self.upper_threshold = upper_verif_threshold
    self.lower_threshold = lower_verif_threshold
    self.same_pic_threshold = same_pic_threshold

    # Readable name for every outcome identifier.
    self.trans_dic = {
        self.same_pic_id: 'same_pic',
        self.must_be_same_id: 'must_same_id',
        self.must_be_not_same_id: 'must_not_same_id',
        self.maybe_same_id: 'maybe_same_id',
    }
    self.nearest = deque(maxlen=nearest_num)
    self.verification_same_person = 0
开发者ID:ustbliubo2014,项目名称:FaceRecognition,代码行数:34,代码来源:recognize_server_research.py
注：本文中的sklearn.neighbors.LSHForest类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。
请发表评论