• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    迪恩网络公众号

Python utils.resample函数代码示例

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

本文整理汇总了Python中sklearn.utils.resample函数的典型用法代码示例。如果您正苦于以下问题:Python resample函数的具体用法?Python resample怎么用?Python resample使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。



在下文中一共展示了resample函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。

示例1: initialize

    def initialize(self, X, k, random_seed, method='naive'):
        """Choose the k initial centroids for k-means.

        Parameters
        ----------
        X : indexable collection of data points
            Individual points are read as ``X[j]`` and must support
            subtraction with a centroid.
        k : int
            Number of centroids to pick.
        random_seed : int, RandomState or None
            Forwarded to ``resample`` as ``random_state`` and also used to
            seed the weighted draws of the 'kmeans++' method.
        method : str
            'naive' picks k distinct points uniformly at random; 'kmeans++'
            follows https://en.wikipedia.org/wiki/K-means%2B%2B

        Returns
        -------
        The selected centroids (the ``resample`` result for 'naive', a numpy
        array built with ``np.append`` for 'kmeans++' when k > 1).

        Raises
        ------
        ValueError
            If ``method`` is neither 'naive' nor 'kmeans++'.
        """
        if method == 'naive':
            # Randomly pick k data points to be the centroids of the k clusters
            return resample(X, n_samples=k, random_state=random_seed, replace=False)

        if method != 'kmeans++':
            # Previously an unknown method fell through and failed on an
            # unbound local; fail with an explicit message instead.
            raise ValueError("Unknown initialization method: %r" % (method,))

        # Use a generator derived from random_seed so that the weighted draws
        # below are reproducible, like the resample() calls.
        if isinstance(random_seed, np.random.RandomState):
            rng = random_seed
        else:
            rng = np.random.RandomState(random_seed)

        # Step 1: Choose one center uniformly at random from among the data points
        centroids = resample(X, n_samples=1, random_state=random_seed, replace=False)
        N = len(X)

        # Sampling the remaining k - 1 centroids
        for _ in range(1, k):
            # Step 2: For each data point x, compute D(x)^2, the squared
            # distance to the nearest center that has already been chosen
            square_distances = np.array([
                min(np.linalg.norm(X[j] - centroid) for centroid in centroids) ** 2
                for j in range(N)
            ])
            total_square_distance = square_distances.sum()

            # Step 3: Choose one new data point at random as a new center,
            # with probability proportional to D(x)^2. Already selected
            # points are naturally excluded because their probability is 0.
            if total_square_distance > 0:
                probabilities = square_distances / total_square_distance
            else:
                # Every point coincides with a chosen center, so the weights
                # are undefined; fall back to a uniform draw.
                probabilities = None

            new_centroid_index = rng.choice(N, p=probabilities)
            centroids = np.append(centroids, [X[new_centroid_index]], axis=0)

        return centroids
开发者ID:bluesilence,项目名称:python,代码行数:28,代码来源:KMeans.py


示例2: test_resample

def test_resample():
    """Check the no-argument case, rejected calls and oversampling of resample."""
    # Border case not worth mentioning in doctests
    assert resample() is None

    # Each (args, kwargs) pair below must be rejected with ValueError
    invalid_calls = (
        (([0], [0, 1]), {}),
        (([0, 1], [0, 1]), {'replace': False, 'n_samples': 3}),
        (([0, 1], [0, 1]), {'meaning_of_life': 42}),
    )
    for args, kwargs in invalid_calls:
        assert_raises(ValueError, resample, *args, **kwargs)

    # Issue:6581, n_samples can be more when replace is True (default).
    oversampled = resample([1, 2], n_samples=5)
    assert_equal(len(oversampled), 5)
开发者ID:allefpablo,项目名称:scikit-learn,代码行数:11,代码来源:test_utils.py


示例3: run_scikit_digits

def run_scikit_digits(epochs=0, layers=0, neuron_count=0):
    """ Train a network on the Scikit-Learn Handwritten Digits dataset.

    The dataset is shuffled, the first 1250 samples are used for training and
    the following 260 samples for testing.  The remaining samples are left
    untouched (they are not used by this function).  The trained network is
    also written to ``my_net.pickle`` with ``dill``.

    Parameters
    ----------
    epochs : int
        Number of iterations of the the training loop for the whole dataset
    layers : int
        Number of layers (not counting the input layer, but does count output
        layer)
    neuron_count : list
        The number of neurons in each of the layers (in order), does not count
        the bias term

    Returns
    -------
    The value returned by ``network.run_unseen`` for the testing samples.

    """

    temp_digits = datasets.load_digits()
    # resample() draws WITH replacement by default, which would let the same
    # sample land in both the training and the testing slice.  replace=False
    # yields a random permutation instead, and passing both arrays in one call
    # keeps every data row aligned with its target.
    digits, temp_answers = utils.resample(
        temp_digits.data, temp_digits.target, replace=False, random_state=3
    )

    num_of_training_vectors = 1250
    num_of_testing_vectors = 260
    testing_end = num_of_training_vectors + num_of_testing_vectors

    answers = temp_answers[:num_of_training_vectors]
    training_set = digits[:num_of_training_vectors]
    testing_set = digits[num_of_training_vectors:testing_end]

    network = Network(layers, neuron_count, training_set[0])
    network.train(training_set, answers, epochs)

    # Context manager guarantees the file is closed even if dumping fails.
    with open("my_net.pickle", "wb") as f:
        dill.dump(network, f)

    return network.run_unseen(testing_set)
开发者ID:totalgood,项目名称:capstone,代码行数:54,代码来源:net_launch.py


示例4: resample_training_dataset

    def resample_training_dataset(self, labels, feature_array, sizes = (5000,500)):
        """
        Resample every class to a requested number of rows.

        Inputs:
            - labels: array of class labels, 1-d or a 2-d column
            - feature_array: array of features, one row per label
            - sizes: tuple, for each class (0,1,etc) the number of training chunks you want.
            i.e for 500 seizures, 5000 baseline, sizes = (5000, 500), as 0 is baseline, 1 is Seizure.
            Entries are matched by position to the sorted unique labels.

        Classes with more rows than requested are undersampled without
        replacement.  Classes with fewer rows are oversampled: the whole class
        is repeated as many full times as fits, and the remainder is drawn
        without replacement.

        Returns:
            (resampled_labels, resampled_features), each stacked class by class.

        WARNING: Up-sampling target class prevents random forest oob from being accurate.
        """
        if len(labels.shape) == 1:
            labels = labels[:, None]

        resampled_labels = []
        resampled_features = []
        for i, label in enumerate(np.unique(labels.astype('int'))):
            target_size = sizes[i]
            class_inds = np.where(labels == label)[0]

            class_labels = labels[class_inds]
            class_features = feature_array[class_inds, :]
            n_available = class_features.shape[0]

            if n_available == target_size:
                logging.debug('label '+str(label)+ ' had exact n as sample, doing nothing!')
                boot_array = class_features
                boot_labels = class_labels

            elif n_available > target_size:  # need to undersample
                # One call keeps features and labels on the same drawn rows.
                boot_array, boot_labels = resample(
                    class_features, class_labels,
                    n_samples=target_size, random_state=7, replace=False)

            else:  # need to oversample
                n_copies = int(target_size / n_available)
                n_extra_needed = target_size - n_copies * n_available
                extra_features, extra_labels = resample(
                    class_features, class_labels,
                    n_samples=n_extra_needed, random_state=7, replace=False)

                boot_array = np.vstack([class_features] * n_copies + [extra_features])
                boot_labels = np.vstack([class_labels] * n_copies + [extra_labels])

            resampled_features.append(boot_array)
            resampled_labels.append(boot_labels)

        # stack both up...
        resampled_labels = np.vstack(resampled_labels)
        resampled_features = np.vstack(resampled_features)

        logging.debug('Original label counts: '+str(pd.Series(labels[:,0]).value_counts()))
        logging.debug('Resampled label counts: '+str(pd.Series(resampled_labels[:,0]).value_counts()))

        return resampled_labels, resampled_features
开发者ID:jcornford,项目名称:pyecog,代码行数:53,代码来源:classifier.py


示例5: run_mnist

def run_mnist(epochs, layers, neuron_count):
    """ Train on ``train.csv`` and write guesses for ``test.csv`` to ``digits.txt``

    Both CSV files are read from the working directory; their first row is
    skipped and every remaining cell is converted to ``int``.  In the training
    file the first column is used as the answer and the rest as the input
    vector.

    Parameters
    ----------
    epochs : int
        Number of iterations of the the training loop for the whole dataset
    layers : int
        Number of layers (not counting the input layer, but does count output
        layer)
    neuron_count : list
        The number of neurons in each of the layers (in order), does not count
        the bias term

    """

    with open('train.csv', 'r') as train_file:
        train_rows = list(csv.reader(train_file))
    train = [[int(cell) for cell in row] for row in train_rows[1:]]

    with open('test.csv', 'r') as test_file:
        test_rows = list(csv.reader(test_file))
    test_set = [[int(cell) for cell in row] for row in test_rows[1:]]

    ans_train = [row[0] for row in train]
    train_set = [row[1:] for row in train]
    # NOTE(review): the header row was already skipped above, so this drops
    # the first data row as well -- confirm that this is intended.
    del ans_train[0]
    del train_set[0]

    # Same random_state for both calls keeps inputs and answers aligned.
    train_set = utils.resample(train_set, random_state=2)
    ans_train = utils.resample(ans_train, random_state=2)

    network = Network(layers, neuron_count, train_set[0])
    network.train(train_set, ans_train, epochs)

    guess_list = network.run_unseen(test_set)
    with open('digits.txt', 'w') as out_file:
        out_file.writelines(str(guess) + '\n' for guess in guess_list)
开发者ID:uglyboxer,项目名称:finnegan,代码行数:52,代码来源:net_launch.py


示例6: test_resample_stratified

def test_resample_stratified():
    """Make sure resample can stratify on an imbalanced binary target."""
    n_samples = 100
    positive_rate = 0.9
    rng = np.random.RandomState(0)
    X = rng.normal(size=(n_samples, 1))
    y = rng.binomial(1, positive_rate, size=n_samples)

    # Without stratification this particular draw contains only 1s.
    plain_draw = resample(X, y, n_samples=10, random_state=0, stratify=None)
    y_not_stratified = plain_draw[1]
    assert np.all(y_not_stratified == 1)

    # With stratification the same draw must contain a 0 as well.
    stratified_draw = resample(X, y, n_samples=10, random_state=0, stratify=y)
    y_stratified = stratified_draw[1]
    assert not np.all(y_stratified == 1)
    assert np.sum(y_stratified) == 9  # all 1s, one 0
开发者ID:daniel-perry,项目名称:scikit-learn,代码行数:15,代码来源:test_utils.py


示例7: eval_prox_random

    def eval_prox_random(self, n_sample_node=5, sample_nodes=None):
        """Compare coordinate-based proximity with ground proximity on sample nodes.

        Parameters
        ----------
        n_sample_node : int
            Number of nodes drawn with ``resample`` when ``sample_nodes`` is
            empty or not given.
        sample_nodes : sequence of str or of int, optional
            Nodes to evaluate.  Strings are used as node names directly,
            integers (including NumPy integers) are positions in
            ``self.cs.nodes()``.

        Returns
        -------
        dict
            ``"nae"``: Series of ``abs(c - g) / g`` for every pair of
            coordinate proximity ``c`` and ground proximity ``g``;
            ``"nae_plot"``: Series of evenly spaced values in [0, 1] indexed
            by the sorted ``nae`` values.

        Raises
        ------
        TypeError
            If ``sample_nodes`` holds something other than str or int.
        """
        cs = self.cs
        measurements = {}
        nodes = cs.nodes()

        if sample_nodes is not None and len(sample_nodes):
            first = sample_nodes[0]
            if isinstance(first, str):
                test_nodes = sample_nodes
            elif isinstance(first, (int, np.integer)):
                test_nodes = [nodes[i] for i in sample_nodes]
            else:
                # Previously this case silently evaluated an empty node list.
                raise TypeError(
                    "sample_nodes must contain node names (str) or positions (int), "
                    "got %s" % type(first).__name__
                )
        else:
            test_nodes = resample(nodes, n_samples=n_sample_node)

        # nae of coordinate-based proximity vs ground-proximity
        coor_test = self.coor_all[test_nodes]

        # `.values` replaces `.as_matrix()`, which no longer exists in current
        # pandas; `.values` is available in old and new versions alike.
        # NOTE(review): assumes proximity_to() returns a pandas DataFrame -- confirm.
        ground_prox = (
            cs.proximity_to(sources=test_nodes, dests=cs.nodes()).values.transpose()
        )  # shape: test_nodes x all_nodes
        coor_prox = np.dot(coor_test.values.transpose(), self.coor_all.values)

        nae = pd.Series.combine(
            pd.Series(coor_prox.flatten()), pd.Series(ground_prox.flatten()), lambda c, g: abs(c - g) / g
        )
        # np.sort stands in for the removed Series.order(): ascending, NaN last.
        nae_plot = pd.Series(np.linspace(0.0, 1.0, num=len(nae)), index=np.sort(nae.values))
        measurements["nae"] = nae
        measurements["nae_plot"] = nae_plot

        return measurements
开发者ID:blublud,项目名称:coordinate_learning,代码行数:30,代码来源:path_accum_coorsys.py


示例8: bootstrap_auc

def bootstrap_auc(df, col, pred_col, n_bootstrap=1000):
    """
    Calculate the bootstrapped AUC for a given col trying to predict a pred_col.

    Rows with NaN in ``col`` are ignored; the caller's DataFrame is not
    modified.

    Parameters
    ----------
    df : pandas.DataFrame
    col : str
        column to retrieve the values from
    pred_col : str
        the column we're trying to predict
    n_bootstrap : int
        the number of bootstrap samples

    Returns
    -------
    numpy.ndarray : AUCs for each sampling.  Entries for samplings that
        contained a single class are skipped and stay at 0.
    """
    scores = np.zeros(n_bootstrap)
    old_len = len(df)
    # Rebind to a filtered copy instead of dropping rows in place, so the
    # DataFrame passed in by the caller keeps all of its rows.
    df = df.dropna(subset=[col])
    new_len = len(df)
    if new_len < old_len:
        logger.info("Dropping NaN values in %s to go from %d to %d rows", col, old_len, new_len)
    values = df[col]
    preds = df[pred_col].astype(int)
    for i in range(n_bootstrap):
        sampled_counts, sampled_pred = resample(values, preds)
        if is_single_class(sampled_pred, col=pred_col):
            continue
        scores[i] = roc_auc_score(sampled_pred, sampled_counts)
    return scores
开发者ID:hammerlab,项目名称:cohorts,代码行数:31,代码来源:model.py


示例9: fit

 def fit(self, dataSet):
     """Fit every classifier in ``self.forest`` on its own resampled copy of dataSet.

     Each row of ``dataSet`` is expected to hold the target in position 0
     followed by the features.
     """
     for tree in self.forest:
         bootstrap = resample(dataSet)
         labels = []
         features = []
         for row in bootstrap:
             labels.append(row[0])
             features.append(row[1:])
         tree.fit(features, labels)
开发者ID:agag4510118,项目名称:CS412-Introduction-to-Data-Mining,代码行数:7,代码来源:RandomForest.py


示例10: boot_estimates

def boot_estimates(model, X, y, nboot):
    '''
    Evaluate coefficient estimates for nboot bootstrap samples.

    The model is refitted once on each resampled (X, y) pair.  Returns an
    array with one row per bootstrap sample, holding the intercept followed
    by the flattened coefficients.
    '''
    coefs = []
    for _ in range(nboot):
        iX, iy = resample(X, y)
        # Fit once per sample and read both attributes from the same fit
        # (the original fitted the model twice for every sample).
        fitted = model.fit(iX, iy)
        coefs.append(np.hstack([fitted.intercept_, fitted.coef_.ravel()]))
    return np.vstack(coefs)
开发者ID:thomasbrawner,项目名称:python_tools,代码行数:7,代码来源:marginal_effects_example.py


示例11: downsample

def downsample(y, sizes=(30000, 3000)):
    """Draw, for every class, a sample of positions of that class in y.

    Parameters
    ----------
    y : array of class labels; ``y == class_i`` must compare element-wise.
    sizes : sequence of int
        ``sizes[class_i]`` is the number of positions to draw (with
        replacement) for class ``class_i``.

    Returns
    -------
    tuple with one entry per class, each holding the drawn positions.
    """
    res = []
    for class_i, sz in enumerate(sizes):
        # Collect the positions where y equals the class.  The previous
        # comprehension kept the boolean values themselves, so the result
        # was a list of True instead of indices.
        indices = [idx for idx, is_member in enumerate(y == class_i) if is_member]
        res.append(resample(indices, replace=True, n_samples=sz))
    return tuple(res)
开发者ID:vadimnazarov,项目名称:llama,代码行数:7,代码来源:llama.py


示例12: run_method_usage

def run_method_usage(methods,cases):
    """Plot how often each method occurs, with bootstrapped error bars.

    The occurrence percentage of every method is estimated from 10000
    resamples of ``cases``; the bar chart is saved as
    ``method_occurrence.pdf`` under ``figure_path`` and shown.

    Returns the method indices ordered by decreasing mean percentage and the
    mean percentages in that order.
    """
    method_names = [m[0] for m in methods]
    n_methods = len(method_names)

    # Bootstrap the percentage error bars:
    percents = np.array([
        100 * np.sum(sample, axis=0) / len(sample)
        for sample in (resample(cases) for _ in range(10000))
    ])
    mean_percents = np.mean(percents, axis=0)
    std_percents = np.std(percents, axis=0) * 1.96
    inds = np.argsort(mean_percents)[::-1].tolist()
    avg_usage = np.mean(mean_percents)
    sorted_means = mean_percents[inds]

    fig = plt.figure()
    ax = fig.add_subplot(111)
    x = np.arange(n_methods)
    ax.plot(x, [avg_usage] * n_methods, '-', color='0.25', lw=1, alpha=0.2)
    ax.bar(x, sorted_means, 0.6, color=paired[0], linewidth=0,
           yerr=std_percents[inds], ecolor=paired[1])
    ax.set_ylabel('Occurrence %', fontsize=30)
    ax.set_xlabel('Method', fontsize=30)
    ax.set_xticks(np.arange(n_methods))
    ax.set_xticklabels(np.array(method_names)[inds], fontsize=8)
    fig.autofmt_xdate()
    fix_axes()
    plt.tight_layout()
    fig.savefig(figure_path+'method_occurrence.pdf', bbox_inches=0)
    fig.show()
    return inds, sorted_means
开发者ID:IDEALLab,项目名称:design_method_recommendation_JMD_2014,代码行数:30,代码来源:paper_experiments.py


示例13: balanced_resample

def balanced_resample(data, labels):
    """Do a balanced resampling of data and labels, returning them.

    Every distinct label is resampled (with replacement) to the number of
    occurrences of the most common label, so all classes end up equally
    represented.

    Parameters
    ----------
    data : array with one row per sample
    labels : array with one label per row of data

    Returns
    -------
    (data, labels) : the resampled rows and their labels, stacked label by
        label and cast back to the dtypes of the inputs.
    """
    # mode() returns (most common value, its count); only the count is needed.
    # ravel()[0] copes with the count being returned as a 1-element array.
    _, mode_count = mstats.mode(labels)
    num_required = int(np.ravel(mode_count)[0])

    data_resampled = []
    labels_resampled = []

    for possible_label in np.unique(labels):
        in_this_label = labels == possible_label

        # Sampling with replacement can draw more rows than the class has,
        # so the class does not need to be duplicated into a buffer first.
        single_data_resampled, single_labels_resampled = utils.resample(
            data[in_this_label],
            labels[in_this_label],
            n_samples=num_required,
            replace=True
        )
        data_resampled.append(single_data_resampled)
        labels_resampled.append(single_labels_resampled)

    return np.vstack(data_resampled).astype(data.dtype), np.hstack(labels_resampled).astype(labels.dtype)
开发者ID:DSsoto,项目名称:Sub8,代码行数:31,代码来源:utils.py


示例14: fit

    def fit(self, X, Y):
        """Fit the model on examples X with targets Y.

        Stores X in ``self.data``, seeds ``self.S``, ``self.y``,
        ``self.alpha`` and ``self.g`` from up to 20 randomly chosen examples,
        then runs at most 5 passes of ``process``/``reprocess`` over all
        examples and finally repeats ``reprocess`` until its result drops
        below ``self.tau``.
        """
        num_examples = len(X)
        data_indices = np.arange(num_examples)
        self.data = X
        Y = np.array(Y, dtype=float)

        # Seed the state with a fixed (random_state=0) subset of examples.
        seed_indices = resample(data_indices, replace=False, n_samples=min(20, num_examples), random_state=0)
        for idx in seed_indices:
            y = Y[idx]
            self.S.add(idx)
            self.y[idx] = y
            self.alpha[idx] = 0.0
            self.g[idx] = y

        # `range` works on Python 2 and 3; the epoch counter is unused and no
        # longer shares its name with the example index of the inner loop.
        for _ in range(5):
            min_delta = 999999999
            for idx in data_indices:
                self.process(idx, Y[idx])
                delta = self.reprocess()
                min_delta = min(min_delta, delta)
            if min_delta < self.tau:
                break

            data_indices = shuffle(data_indices)

        while True:
            delta = self.reprocess()
            if delta < self.tau:
                break
开发者ID:woohp,项目名称:ai_tidbits,代码行数:28,代码来源:lasvm.py


示例15: test_mnist

    def test_mnist(self):
        """Train LASVM to separate digit 0 from the rest and require > 95% accuracy."""
        mnist = fetch_mldata('MNIST original')

        # 1000 training examples; label +1 for digit 0, -1 otherwise
        train_X, train_digits = resample(mnist.data, mnist.target, replace=False, n_samples=1000, random_state=0)
        train_X = train_X.astype(float)
        train_Y = [1 if digit == 0 else -1 for digit in train_digits]

        svm = LASVM(C=10, tau=0.001)
        svm.fit(train_X, train_Y)

        # 300 test examples drawn with a different seed
        test_X, test_digits = resample(mnist.data, mnist.target, replace=False, n_samples=300, random_state=2)
        test_X = test_X.astype(float)
        test_Y = [1 if digit == 0 else -1 for digit in test_digits]

        predicted = svm.predict(test_X)
        n_correct = np.sum(predicted == test_Y)
        percent_correct = n_correct / 300.0

        self.assertGreater(percent_correct, 0.95)
开发者ID:woohp,项目名称:ai_tidbits,代码行数:16,代码来源:lasvm.py


示例16: Reduce_scikit_kmeans

def Reduce_scikit_kmeans(img, number_of_colors):
    """Quantize the colors of an image with k-means.

    The image is scaled to [0, 1] by dividing by 255, k-means is fitted on a
    random sample of at most 1000 pixels, and every pixel is then assigned to
    its nearest cluster.

    Parameters
    ----------
    img : array-like with three dimensions, the last one of size 3
    number_of_colors : int
        Number of clusters (colors) to reduce the image to.

    Returns
    -------
    (cluster_centers, labels) : the cluster centers of the fitted model and
        the cluster index predicted for every pixel of the flattened image.
    """
    t0 = time()
    from sklearn.cluster import KMeans
    from sklearn.utils import resample

    img_64 = np.array(img, dtype=np.float64) / 255
    rows, cols, d = tuple(img_64.shape)
    assert d == 3
    image_array = np.reshape(img_64, (rows * cols, d))

    LOGGER.info("shape=%s", image_array.shape)
    image_array_sample = resample(
        image_array,
        replace=True,
        n_samples=min([image_array.shape[0], 1000]),
        random_state=1
    )

    # `precompute_distances` is no longer passed: the parameter has been
    # removed from KMeans, so passing it raises TypeError on current
    # scikit-learn.
    kmeans = KMeans(
        n_clusters=number_of_colors,
        random_state=1).fit(image_array_sample)

    labels = kmeans.predict(image_array)
    LOGGER.info("ms=%s", ms(t0))

    return kmeans.cluster_centers_, labels
开发者ID:rdefeo,项目名称:image_processing,代码行数:26,代码来源:color.py


示例17: show_bootstrap_statistics

def show_bootstrap_statistics(clf, X, y, features):
    """Print bootstrap statistics of the normalized coefficients of clf.

    The classifier is refitted BOOTSTRAP_ITERATIONS times on resampled
    (X, y); for every feature a 95% percentile interval, mean/std ratio and
    the derived probability are printed as a table.
    """
    # One list of sampled coefficients per feature
    coefs = [[] for _ in features]

    for _ in range(BOOTSTRAP_ITERATIONS):
        X_sample, y_sample = resample(X, y)
        clf.fit(X_sample, y_sample)
        for position, coef in enumerate(get_normalized_coefs(clf)):
            coefs[position].append(coef)

    # 'Building' reports the very same samples as 'POI'
    coefs[features.index('Building')] = coefs[features.index('POI')]

    row_format = '{:<20}{:<20}{:<10}{:<10}'

    print()
    print('***** Bootstrap statistics *****')
    print(row_format.format('Feature', '95% interval', 't-value', 'Pr(>|t|)'))
    print()
    for feature, samples in zip(features, coefs):
        values = np.array(samples)
        lo = np.percentile(values, 2.5)
        hi = np.percentile(values, 97.5)
        interval = '({:.3f}, {:.3f})'.format(lo, hi)
        tv = np.mean(values) / np.std(values)
        pr = (1.0 - t.cdf(x=abs(tv), df=len(values))) * 0.5

        print(row_format.format(feature, interval, '{:.3f}'.format(tv), '{:.3f}'.format(pr)))
开发者ID:milchakov,项目名称:omim,代码行数:34,代码来源:scoring_model.py


示例18: test_resample_stratify_2dy

def test_resample_stratify_2dy():
    """Make sure y can be 2d when stratifying."""
    n_samples = 100
    rng = np.random.RandomState(0)
    features = rng.normal(size=(n_samples, 1))
    targets = rng.randint(0, 2, size=(n_samples, 2))

    resampled = resample(features, targets, n_samples=50, random_state=rng, stratify=targets)
    resampled_targets = resampled[1]
    assert resampled_targets.ndim == 2
开发者ID:daniel-perry,项目名称:scikit-learn,代码行数:8,代码来源:test_utils.py


示例19: bootstrap_auc

def bootstrap_auc(y_c,y_pred,N=100):
    """Bootstrap the AUC score.

    Resamples the (y_c, y_pred) pairs N times, computes the ROC AUC of every
    resample and prints the mean and standard deviation of the scores.
    """
    # Pair targets and predictions once so each resample keeps them aligned.
    paired = np.column_stack([y_c, y_pred])
    scores = []
    for _ in range(N):
        res_y = resample(paired)
        scores.append(roc_auc_score(res_y[:, 0], res_y[:, 1]))

    # A single pre-formatted string prints identically on Python 2 and 3
    # (the original used Python 2 print statements).
    print('Score is : %.4f +- %.4f' % (np.mean(scores), np.std(scores)))
开发者ID:riblidezso,项目名称:mhc_pred,代码行数:9,代码来源:utils.py


示例20: make_pred_prob_plot_data

def make_pred_prob_plot_data(model, df, column, n_boot=1000):
    """Bootstrap the mean predicted probability while sweeping one column.

    ``column`` is set, for all rows of a copy of ``df``, to each of the
    evenly spaced values between its minimum and maximum; for every value the
    positive-class probabilities predicted by ``model`` are resampled
    ``n_boot`` times and the mean of each resample is recorded.

    Parameters
    ----------
    model : object with ``predict_proba``
    df : pandas.DataFrame accepted by ``model.predict_proba``
    column : str
        Name of the column to sweep.
    n_boot : int
        Number of bootstrap resamples per value (default 1000).

    Returns
    -------
    (rng, probs) : the swept values and an array with one row per bootstrap
        resample and one column per swept value.
    """
    dfc = df.copy()
    rng = np.linspace(df[column].min(), df[column].max())
    probs = []
    for val in rng:
        dfc[column] = val
        pred_probs = model.predict_proba(dfc)[:, 1]
        # `range` instead of the Python 2 only `xrange`
        probs.append([resample(pred_probs).mean() for _ in range(n_boot)])
    return rng, np.array(probs).T
开发者ID:jessedow24,项目名称:Fraud_Detection_Case_Study,代码行数:9,代码来源:make_graphs.py



注:本文中的sklearn.utils.resample函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。


鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
Python utils.safe_indexing函数代码示例发布时间:2022-05-27
下一篇:
Python utils.gen_even_slices函数代码示例发布时间:2022-05-27
热门推荐
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap