Python fixes.bincount Function Code Examples


This article collects typical usage examples of Python's sklearn.utils.fixes.bincount function. If you are wondering what bincount does, how to call it, or what real-world code using it looks like, the curated examples below should help.



Twenty code examples of the bincount function are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
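A quick orientation before the examples: sklearn.utils.fixes.bincount was a thin compatibility wrapper around numpy.bincount (historically it existed to backfill the minlength argument on old NumPy versions), and it has been removed from recent scikit-learn releases, so on a modern install np.bincount is the drop-in replacement. The minimal sketch below, assuming only NumPy, demonstrates the three call patterns that recur throughout the examples: plain counting, minlength padding, and weights for per-bin weighted sums.

import numpy as np

labels = np.array([0, 1, 1, 3])

# Plain counting: entry i is the number of times value i occurs.
print(np.bincount(labels))                   # [1 2 0 1]

# minlength pads the histogram to a fixed size, the pattern the
# bootstrap examples use to count draws per sample index.
print(np.bincount(labels, minlength=6))      # [1 2 0 1 0 0]

# weights turns counts into per-bin weighted sums, as in the
# min_weight_fraction_leaf and sample_weight examples.
w = np.array([0.5, 1.0, 1.0, 2.0])
print(np.bincount(labels, weights=w))        # [0.5 2.  0.  2. ]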

Example 1: check_min_samples_leaf

def check_min_samples_leaf(name):
    X, y = hastie_X, hastie_y

    # Test if leaves contain more than leaf_count training examples
    ForestEstimator = FOREST_ESTIMATORS[name]

    # test boundary value
    assert_raises(ValueError, ForestEstimator(min_samples_leaf=-1).fit, X, y)
    assert_raises(ValueError, ForestEstimator(min_samples_leaf=0).fit, X, y)

    est = ForestEstimator(min_samples_leaf=5, n_estimators=1, random_state=0)
    est.fit(X, y)
    out = est.estimators_[0].tree_.apply(X)
    node_counts = bincount(out)
    # drop inner nodes
    leaf_count = node_counts[node_counts != 0]
    assert_greater(np.min(leaf_count), 4, "Failed with {0}".format(name))

    est = ForestEstimator(min_samples_leaf=0.25, n_estimators=1, random_state=0)
    est.fit(X, y)
    out = est.estimators_[0].tree_.apply(X)
    node_counts = np.bincount(out)
    # drop inner nodes
    leaf_count = node_counts[node_counts != 0]
    assert_greater(np.min(leaf_count), len(X) * 0.25 - 1, "Failed with {0}".format(name))
Author: nelson-liu | Project: scikit-learn | Lines: 25 | Source: test_forest.py


Example 2: check_min_weight_fraction_leaf

def check_min_weight_fraction_leaf(name, X, y):
    # Test if leaves contain at least min_weight_fraction_leaf of the
    # training set
    ForestEstimator = FOREST_ESTIMATORS[name]
    rng = np.random.RandomState(0)
    weights = rng.rand(X.shape[0])
    total_weight = np.sum(weights)

    # test both DepthFirstTreeBuilder and BestFirstTreeBuilder
    # by setting max_leaf_nodes
    for max_leaf_nodes in (None, 1000):
        for frac in np.linspace(0, 0.5, 6):
            est = ForestEstimator(min_weight_fraction_leaf=frac,
                                  max_leaf_nodes=max_leaf_nodes,
                                  random_state=0)
            if isinstance(est, (RandomForestClassifier,
                                RandomForestRegressor)):
                est.bootstrap = False
            est.fit(X, y, sample_weight=weights)
            out = est.estimators_[0].tree_.apply(X)
            node_weights = bincount(out, weights=weights)
            # drop inner nodes
            leaf_weights = node_weights[node_weights != 0]
            assert_greater_equal(
                np.min(leaf_weights),
                total_weight * est.min_weight_fraction_leaf,
                "Failed with {0} "
                "min_weight_fraction_leaf={1}".format(
                    name, est.min_weight_fraction_leaf))
Author: EddieBurning | Project: scikit-learn | Lines: 29 | Source: test_forest.py


Example 3: _make_test_folds

    def _make_test_folds(self, X, y=None, groups=None):
        if self.shuffle:
            rng = check_random_state(self.random_state)
        else:
            rng = self.random_state
        y = np.asarray(y)
        n_samples = len(X)
        y = ','.join(y).split(',')
        unique_y, y_inversed = np.unique(y, return_inverse=True)
        y_counts = bincount(y_inversed)
        min_groups = np.min(y_counts)
        if np.all(self.n_splits > y_counts):
            raise ValueError("All the n_groups for individual classes"
                             " are less than n_splits=%d."
                             % (self.n_splits))
        if self.n_splits > min_groups:
            warnings.warn(("The least populated class in y has only %d"
                           " members, which is too few. The minimum"
                           " number of groups for any class cannot"
                           " be less than n_splits=%d."
                           % (min_groups, self.n_splits)), Warning)

        # pre-assign each sample to a test fold index using individual KFold
        # splitting strategies for each class so as to respect the balance of
        # classes

        # NOTE: Passing the data corresponding to ith class say X[y==class_i]
        # will break when the data is not 100% stratifiable for all classes.
        # So we pass np.zeros(max(c, n_splits)) as data to the KFold
        test_folds = iterative_stratification(X, set(y), self.n_splits, rng)
        return test_folds
Author: daniaki | Project: ppi_wrangler | Lines: 31 | Source: cross_validation.py


Example 4: _recompute_centers

def _recompute_centers( X, labels, n_clusters):
    """
    Computation of cluster centers / means.

    Parameters
    ----------
    X: array-like, shape (n_samples, n_features)

    labels: array of integers, shape (n_samples)
        Current label assignment

    n_clusters: int
        Number of desired clusters

    Returns
    -------
    centers: array, shape (n_clusters, n_features)
        The resulting centers
    """

    n_samples = X.shape[0]
    n_features = X.shape[1]
   
    # Initialize centers to all zero
    centers = np.zeros((n_clusters, n_features))
    n_samples_in_cluster = bincount(labels, minlength=n_clusters)

    # Compute a center for each label
    # For each label, average over samples and features
    # Take all of the samples in a cluster and sum their features...
    for sample_idx in range(n_samples):
        centers[labels[sample_idx]] += X[sample_idx]

    # ...then normalize by the size of the cluster to get the mean
    centers /= n_samples_in_cluster[:, np.newaxis]

    return centers
Author: AkiraKaneshiro | Project: gadsdc | Lines: 33 | Source: kmeans_exercise.py


Example 5: _generate_unsampled_indices

def _generate_unsampled_indices(random_state, n_samples):
    '''Samples out of bag'''
    sample_indices = _generate_sample_indices(random_state, n_samples)
    sample_counts = bincount(sample_indices, minlength=n_samples)
    unsampled_mask = sample_counts == 0
    indices_range = np.arange(n_samples)
    unsampled_indices = indices_range[unsampled_mask]
    return unsampled_indices
Author: thomasbrawner | Project: regime_failure | Lines: 8 | Source: oob_validation.py


Example 6: entropy

    def entropy(samples):
        n_samples = len(samples)
        entropy = 0.

        for count in bincount(samples):
            p = 1. * count / n_samples
            if p > 0:
                entropy -= p * np.log2(p)

        return entropy
Author: EddieBurning | Project: scikit-learn | Lines: 10 | Source: test_forest.py


Example 7: test_sample_weight

def test_sample_weight():
    """Check sample weighting."""
    # Test that zero-weighted samples are not taken into account
    X = np.arange(100)[:, np.newaxis]
    y = np.ones(100)
    y[:50] = 0.0

    sample_weight = np.ones(100)
    sample_weight[y == 0] = 0.0

    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(X, y, sample_weight=sample_weight)
    assert_array_equal(clf.predict(X), np.ones(100))

    # Test that low weighted samples are not taken into account at low depth
    X = np.arange(200)[:, np.newaxis]
    y = np.zeros(200)
    y[50:100] = 1
    y[100:200] = 2
    X[100:200, 0] = 200

    sample_weight = np.ones(200)

    sample_weight[y == 2] = .51  # Samples of class '2' are still weightier
    clf = DecisionTreeClassifier(max_depth=1, random_state=0)
    clf.fit(X, y, sample_weight=sample_weight)
    assert_equal(clf.tree_.threshold[0], 149.5)

    sample_weight[y == 2] = .50  # Samples of class '2' are no longer weightier
    clf = DecisionTreeClassifier(max_depth=1, random_state=0)
    clf.fit(X, y, sample_weight=sample_weight)
    assert_equal(clf.tree_.threshold[0], 49.5)  # Threshold should have moved

    # Test that sample weighting is the same as having duplicates
    X = iris.data
    y = iris.target

    duplicates = rng.randint(0, X.shape[0], 200)

    clf = DecisionTreeClassifier(random_state=1)
    clf.fit(X[duplicates], y[duplicates])

    sample_weight = bincount(duplicates, minlength=X.shape[0])
    clf2 = DecisionTreeClassifier(random_state=1)
    clf2.fit(X, y, sample_weight=sample_weight)

    internal = clf.tree_.children_left != tree._tree.TREE_LEAF
    assert_array_almost_equal(clf.tree_.threshold[internal],
                              clf2.tree_.threshold[internal])
Author: Niteloser | Project: scikit-learn | Lines: 49 | Source: test_tree.py


Example 8: _recompute_centers

def _recompute_centers( X, labels, n_clusters):
    """
    Computation of cluster centers / means.

    Parameters
    ----------
    X: array-like, shape (n_samples, n_features)

    labels: array of integers, shape (n_samples)
        Current label assignment

    n_clusters: int
        Number of desired clusters

    Returns
    -------
    centers: array, shape (n_clusters, n_features)
        The resulting centers
    """

    n_samples = X.shape[0]
    n_features = X.shape[1]
   
    # Initialize centers to all zero
    centers = np.zeros((n_clusters, n_features))
    n_samples_in_cluster = bincount(labels, minlength=n_clusters)


    # Compute a center for each label
    # For each label, average over samples and features
    # Take all of the samples in a cluster and add their features:
    # for each sample, look up its label and accumulate the sample's
    # feature vector into that label's running total.
    for sample_idx in xrange(n_samples):
        label = labels[sample_idx]
        centers[label] += X[sample_idx]


    # Normalize by the size of the cluster
    centers /= n_samples_in_cluster[:, np.newaxis]

    return centers
Author: vijayvenkatesh | Project: datascienceplusMLcode | Lines: 47 | Source: kmeans_exercise.py


Example 9: check_min_samples_leaf

def check_min_samples_leaf(name, X, y):
    # Test if leaves contain more than leaf_count training examples
    ForestEstimator = FOREST_ESTIMATORS[name]

    # test both DepthFirstTreeBuilder and BestFirstTreeBuilder
    # by setting max_leaf_nodes
    for max_leaf_nodes in (None, 1000):
        est = ForestEstimator(min_samples_leaf=5,
                              max_leaf_nodes=max_leaf_nodes,
                              random_state=0)
        est.fit(X, y)
        out = est.estimators_[0].tree_.apply(X)
        node_counts = bincount(out)
        # drop inner nodes
        leaf_count = node_counts[node_counts != 0]
        assert_greater(np.min(leaf_count), 4,
                       "Failed with {0}".format(name))
Author: EddieBurning | Project: scikit-learn | Lines: 17 | Source: test_forest.py


Example 10: _balanced_parallel_build_trees

def _balanced_parallel_build_trees(n_trees, forest, X, y, sample_weight, sample_mask, X_argsorted, seed, verbose):
    """Private function used to build a batch of trees within a job"""
    from sklearn.utils import check_random_state
    from sklearn.utils.fixes import bincount
    import random
    MAX_INT = numpy.iinfo(numpy.int32).max
    random_state = check_random_state(seed)

    trees = []
    for i in xrange(n_trees):
        if verbose > 1:
            print("building tree %d of %d" % (i+1, n_trees))
        seed = random_state.randint(MAX_INT)

        tree = forest._make_estimator(append = False)
        tree.set_params(compute_importances=forest.compute_importances)
        tree.set_params(random_state = check_random_state(seed))

        if forest.bootstrap:
            n_samples = X.shape[0]
            if sample_weight is None:
                curr_sample_weight = numpy.ones((n_samples,), dtype=numpy.float64)
            else:
                curr_sample_weight = sample_weight.copy()

            ty = list(enumerate(y))
            indices = DataUtils.FilterData(ty, val=1, frac=0.5, col=1, indicesToUse=0, indicesOnly=1)[0]
            indices2 = random_state.randint(0, len(indices), len(indices))
            indices = [indices[j] for j in indices2]
            sample_counts = bincount(indices, minlength=n_samples)

            curr_sample_weight *= sample_counts
            curr_sample_mask = sample_mask.copy()
            curr_sample_mask[sample_counts==0] = False

            tree.fit(X, y, sample_weight=curr_sample_weight, sample_mask=curr_sample_mask, X_argsorted=X_argsorted, check_input=False)
            tree.indices = curr_sample_mask
        else:
            tree.fit(X, y, sample_weight=sample_weight, sample_mask=sample_mask, X_argsorted=X_argsorted, check_input=False)
        trees.append(tree)
    return trees
Author: ryancoleman | Project: TDT-tutorial-2014 | Lines: 41 | Source: random_forest_functions.py


Example 11: _recompute_centers

def _recompute_centers( X, labels, n_clusters):
    """
    Computation of cluster centers / means.

    Parameters
    ----------
    X: array-like, shape (n_samples, n_features)

    labels: array of integers, shape (n_samples)
        Current label assignment

    n_clusters: int
        Number of desired clusters

    Returns
    -------
    centers: array, shape (n_clusters, n_features)
        The resulting centers
    """

    n_samples = X.shape[0]
    n_features = X.shape[1]
   
    # Initialize centers to all zero
    centers = np.zeros((n_clusters, n_features))
    n_samples_in_cluster = bincount(labels, minlength=n_clusters)


    # Compute a center for each label
    # For each label, average over samples and features
    # 1. For each sample...
    for sample_idx in range(n_samples):
        # 2. ...look up its label...
        label = labels[sample_idx]
        # 3. ...and add the sample's features to centers[label]
        centers[label] += X[sample_idx]

    # Normalize by the size of the cluster
    centers /= n_samples_in_cluster[:, np.newaxis]

    return centers
Author: asbhat | Project: Data-Science-Class | Lines: 39 | Source: template_kmeans_exercise.py


Example 12: _iter_indices

    def _iter_indices(self):
        rng = np.random.RandomState(self.random_state)
        cls_count = bincount(self.y_indices)

        for n in range(self.n_iter):
            train = []
            test = []

            for i, cls in enumerate(self.classes):
                sample_size = int(cls_count[i]*(1-self.test_size))
                randint = rng.randint(cls_count[i], size=sample_size)
                aidx = np.where((self.y == cls))[0]
                iidx = aidx[randint]
                oidx = aidx[list(set(range(cls_count[i])).difference(set(randint)))]

                train.extend(iidx)
                test.extend(oidx)

            train = rng.permutation(train)
            test = rng.permutation(test)

            yield train, test
Author: Anhmike | Project: Kaggle_HomeDepot | Lines: 22 | Source: extreme_ensemble_selection.py


Example 13: _recompute_centers

def _recompute_centers( X, labels, n_clusters):
    """
    Computation of cluster centers / means.

    Parameters
    ----------
    X: array-like, shape (n_samples, n_features)

    labels: array of integers, shape (n_samples)
        Current label assignment

    n_clusters: int
        Number of desired clusters

    Returns
    -------
    centers: array, shape (n_clusters, n_features)
        The resulting centers
    """

    n_samples = X.shape[0]
    n_features = X.shape[1]
   
    # Initialize centers to all zero
    centers = np.zeros((n_clusters, n_features))
    n_samples_in_cluster = bincount(labels, minlength=n_clusters)


    # Compute a center for each label
    # For each label, average over samples and features
    for i in range(n_samples):
        for j in range(n_features):
            centers[labels[i], j] += X[i, j]

    # Normalize by the size of the cluster
    centers /= n_samples_in_cluster[:, np.newaxis]

    return centers
Author: GusSand | Project: GADS7 | Lines: 39 | Source: kmeans_exercise_final.py


Example 14: test_sample_weight

def test_sample_weight():
    """Check sample weighting."""
    # Test that zero-weighted samples are not taken into account
    X = np.arange(100)[:, np.newaxis]
    y = np.ones(100)
    y[:50] = 0.0

    sample_weight = np.ones(100)
    sample_weight[y == 0] = 0.0

    clf = tree.DecisionTreeClassifier()
    clf.fit(X, y, sample_weight=sample_weight)
    assert_array_equal(clf.predict(X), np.ones(100))

    # Test that low weighted samples are not taken into account at low depth
    X = np.arange(200)[:, np.newaxis]
    y = np.zeros(200)
    y[50:100] = 1
    y[100:200] = 2
    X[100:200, 0] = 200

    sample_weight = np.ones(200)

    sample_weight[y == 2] = .51  # Samples of class '2' are still weightier
    clf = tree.DecisionTreeClassifier(max_depth=1)
    clf.fit(X, y, sample_weight=sample_weight)
    assert_equal(clf.tree_.threshold[0], 149.5)

    sample_weight[y == 2] = .50  # Samples of class '2' are no longer weightier
    clf = tree.DecisionTreeClassifier(max_depth=1)
    clf.fit(X, y, sample_weight=sample_weight)
    assert_equal(clf.tree_.threshold[0], 49.5)  # Threshold should have moved

    # Test that sample weighting is the same as having duplicates
    X = iris.data
    y = iris.target

    duplicates = rng.randint(0, X.shape[0], 1000)

    clf = tree.DecisionTreeClassifier(random_state=1)
    clf.fit(X[duplicates], y[duplicates])

    from sklearn.utils.fixes import bincount
    sample_weight = bincount(duplicates, minlength=X.shape[0])
    clf2 = tree.DecisionTreeClassifier(random_state=1)
    clf2.fit(X, y, sample_weight=sample_weight)

    internal = clf.tree_.children_left != tree._tree.TREE_LEAF
    assert_array_equal(clf.tree_.threshold[internal],
                       clf2.tree_.threshold[internal])

    # Test negative weights
    X = iris.data
    y = iris.target

    sample_weight = -np.ones(X.shape[0])
    clf = tree.DecisionTreeClassifier(random_state=1)
    assert_raises(ValueError, clf.fit, X, y, sample_weight=sample_weight)

    sample_weight = np.ones(X.shape[0])
    sample_weight[0] = -1
    clf = tree.DecisionTreeClassifier(random_state=1)
    clf.fit(X, y, sample_weight=sample_weight)

    # Check that predict_proba returns valid probabilities in the presence of
    # samples with negative weight
    X = iris.data
    y = iris.target

    sample_weight = rng.normal(.5, 1.0, X.shape[0])
    clf = tree.DecisionTreeClassifier(random_state=1)
    clf.fit(X, y, sample_weight=sample_weight)
    proba = clf.predict_proba(X)
    assert (proba >= 0).all() and (proba <= 1).all()
Author: Calvin-O | Project: scikit-learn | Lines: 74 | Source: test_tree.py


Example 15: sensitivity_specificity_support


#......... part of the code omitted here .........
            raise ValueError("Target is %s but average='binary'. Please "
                             "choose another average setting." % y_type)
    elif pos_label not in (None, 1):
        warnings.warn("Note that pos_label (set to %r) is ignored when "
                      "average != 'binary' (got %r). You may use "
                      "labels=[pos_label] to specify a single positive class."
                      % (pos_label, average), UserWarning)

    if labels is None:
        labels = present_labels
        n_labels = None
    else:
        n_labels = len(labels)
        labels = np.hstack(
            [labels, np.setdiff1d(
                present_labels, labels, assume_unique=True)])

    # Calculate tp_sum, pred_sum, true_sum ###

    if y_type.startswith('multilabel'):
        raise ValueError('imblearn does not support multilabel')
    elif average == 'samples':
        raise ValueError("Sample-based precision, recall, fscore is "
                         "not meaningful outside multilabel "
                         "classification. See the accuracy_score instead.")
    else:
        le = LabelEncoder()
        le.fit(labels)
        y_true = le.transform(y_true)
        y_pred = le.transform(y_pred)
        sorted_labels = le.classes_

        # labels are now from 0 to len(labels) - 1 -> use bincount
        tp = y_true == y_pred
        tp_bins = y_true[tp]
        if sample_weight is not None:
            tp_bins_weights = np.asarray(sample_weight)[tp]
        else:
            tp_bins_weights = None

        if len(tp_bins):
            tp_sum = bincount(
                tp_bins, weights=tp_bins_weights, minlength=len(labels))
        else:
            # Pathological case
            true_sum = pred_sum = tp_sum = np.zeros(len(labels))
        if len(y_pred):
            pred_sum = bincount(
                y_pred, weights=sample_weight, minlength=len(labels))
        if len(y_true):
            true_sum = bincount(
                y_true, weights=sample_weight, minlength=len(labels))

        # Compute the true negative
        tn_sum = y_true.size - (pred_sum + true_sum - tp_sum)

        # Retain only selected labels
        indices = np.searchsorted(sorted_labels, labels[:n_labels])
        tp_sum = tp_sum[indices]
        true_sum = true_sum[indices]
        pred_sum = pred_sum[indices]
        tn_sum = tn_sum[indices]

    if average == 'micro':
        tp_sum = np.array([tp_sum.sum()])
        pred_sum = np.array([pred_sum.sum()])
        true_sum = np.array([true_sum.sum()])
        tn_sum = np.array([tn_sum.sum()])

    # Finally, we have all our sufficient statistics. Divide! #

    with np.errstate(divide='ignore', invalid='ignore'):
        # Divide, and on zero-division, set scores to 0 and warn:

        # Oddly, we may get an "invalid" rather than a "divide" error
        # here.
        specificity = _prf_divide(tn_sum, tn_sum + pred_sum - tp_sum,
                                  'specificity', 'predicted', average,
                                  warn_for)
        sensitivity = _prf_divide(tp_sum, true_sum, 'sensitivity', 'true',
                                  average, warn_for)

    # Average the results

    if average == 'weighted':
        weights = true_sum
        if weights.sum() == 0:
            return 0, 0, None
    elif average == 'samples':
        weights = sample_weight
    else:
        weights = None

    if average is not None:
        assert average != 'binary' or len(specificity) == 1
        specificity = np.average(specificity, weights=weights)
        sensitivity = np.average(sensitivity, weights=weights)
        true_sum = None  # return no support

    return sensitivity, specificity, true_sum
Author: kellyhennigan | Project: cueexp_scripts | Lines: 101 | Source: classification.py


Example 16: _parallel_build_estimators

def _parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight,
                               seeds, verbose):
    """Private function used to build a batch of estimators within a job."""
    # Retrieve settings
    n_samples, n_features = X.shape
    max_samples = ensemble.max_samples
    max_features = ensemble.max_features

    if (not isinstance(max_samples, (numbers.Integral, np.integer)) and
            (0.0 < max_samples <= 1.0)):
        max_samples = int(max_samples * n_samples)

    if (not isinstance(max_features, (numbers.Integral, np.integer)) and
            (0.0 < max_features <= 1.0)):
        max_features = int(max_features * n_features)

    bootstrap = ensemble.bootstrap
    bootstrap_features = ensemble.bootstrap_features
    support_sample_weight = ("sample_weight" in
                             getargspec(ensemble.base_estimator_.fit)[0])

    # Build estimators
    estimators = []
    estimators_samples = []
    estimators_features = []

    for i in range(n_estimators):
        if verbose > 1:
            print("building estimator %d of %d" % (i + 1, n_estimators))

        random_state = check_random_state(seeds[i])
        seed = check_random_state(random_state.randint(MAX_INT))
        estimator = ensemble._make_estimator(append=False)

        try:  # Not all estimators accept a random_state
            estimator.set_params(random_state=seed)
        except ValueError:
            pass

        # Draw features
        if bootstrap_features:
            features = random_state.randint(0, n_features, max_features)
        else:
            features = sample_without_replacement(n_features,
                                                  max_features,
                                                  random_state=random_state)

        # Draw samples, using sample weights, and then fit
        if support_sample_weight:
            if sample_weight is None:
                curr_sample_weight = np.ones((n_samples,))
            else:
                curr_sample_weight = sample_weight.copy()

            if bootstrap:
                indices = random_state.randint(0, n_samples, max_samples)
                sample_counts = bincount(indices, minlength=n_samples)
                curr_sample_weight *= sample_counts

            else:
                not_indices = sample_without_replacement(
                    n_samples,
                    n_samples - max_samples,
                    random_state=random_state)

                curr_sample_weight[not_indices] = 0

            estimator.fit(X[:, features], y, sample_weight=curr_sample_weight)
            samples = curr_sample_weight > 0.

        # Draw samples, using a mask, and then fit
        else:
            if bootstrap:
                indices = random_state.randint(0, n_samples, max_samples)
            else:
                indices = sample_without_replacement(n_samples,
                                                     max_samples,
                                                     random_state=random_state)

            sample_counts = bincount(indices, minlength=n_samples)

            estimator.fit((X[indices])[:, features], y[indices])
            samples = sample_counts > 0.

        estimators.append(estimator)
        estimators_samples.append(samples)
        estimators_features.append(features)

    return estimators, estimators_samples, estimators_features
Author: orazaro | Project: kgml | Lines: 89 | Source: bag.py


Example 17: _document_frequency

def _document_frequency(X):
    """Count the number of non-zero values for each feature in sparse X."""
    if sp.isspmatrix_csr(X):
        return bincount(X.indices, minlength=X.shape[1])
    else:
        return np.diff(sp.csc_matrix(X, copy=False).indptr)
Author: NPSDC | Project: Online-News-Clustering-SMAI-PROJECT- | Lines: 6 | Source: featureExtraction.py


Example 18: _iter_indices

    def _iter_indices(self, frame, y):
        """Iterate the indices with stratification.

        Parameters
        ----------

        frame : H2OFrame
            The frame to split

        y : string
            The column to stratify.

        Returns
        -------

        train : np.ndarray, shape=(n_samples,)
            The train indices

        test : np.ndarray, shape=(n_samples,)
            The test indices
        """
        n_samples = frame.shape[0]
        n_train, n_test = _validate_shuffle_split(n_samples,
                                                  self.test_size, self.train_size)

        # need to validate y...
        y = _val_y(y)
        target = np.asarray(frame[y].as_data_frame(use_pandas=True)[y].tolist())

        classes, y_indices = np.unique(target, return_inverse=True)
        n_classes = classes.shape[0]

        class_counts = bincount(y_indices)
        if np.min(class_counts) < 2:
            raise ValueError('The least populated class in y has only 1 '
                             'member, which is too few. The minimum number of labels '
                             'for any class cannot be less than 2.')

        if n_train < n_classes:
            raise ValueError('The train_size=%d should be greater than or '
                             'equal to the number of classes=%d' % (n_train, n_classes))

        if n_test < n_classes:
            raise ValueError('The test_size=%d should be greater than or '
                             'equal to the number of classes=%d' % (n_test, n_classes))

        rng = check_random_state(self.random_state)
        p_i = class_counts / float(n_samples)
        n_i = np.round(n_train * p_i).astype(int)
        t_i = np.minimum(class_counts - n_i, np.round(n_test * p_i).astype(int))

        for _ in range(self.n_splits):
            train = []
            test = []

            for i, class_i in enumerate(classes):
                permutation = rng.permutation(class_counts[i])
                perm_indices_class_i = np.where((target == class_i))[0][permutation]

                train.extend(perm_indices_class_i[:n_i[i]])
                test.extend(perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]])

            # Might end up here with less samples in train and test than we asked
            # for, due to rounding errors.
            if len(train) + len(test) < n_train + n_test:
                missing_indices = np.where(bincount(train + test, minlength=len(target)) == 0)[0]
                missing_indices = rng.permutation(missing_indices)
                n_missing_train = n_train - len(train)
                n_missing_test = n_test - len(test)

                if n_missing_train > 0:
                    train.extend(missing_indices[:n_missing_train])
                if n_missing_test > 0:
                    test.extend(missing_indices[-n_missing_test:])

            train = rng.permutation(train)
            test = rng.permutation(test)

            yield train, test
Author: tgsmith61591 | Project: skutil | Lines: 79 | Source: split.py


Example 19: grow_forest

def grow_forest(forest, X, y, seeds, labels=None):
    """Grow a forest of random trees"""
    # Convert data
    X, = check_arrays(X, dtype=DTYPE, sparse_format="dense")
    # Make a list container for grown trees
    n_trees = forest.n_estimators
    trees = []
    # For each tree in the forest
    for i in range(n_trees):
        # Make a np.random.RandomState instance from the tree's planting seed
        random_state = check_random_state(seeds[i])
        # generate a random seed for a branching seed
        seed = random_state.randint(MAX_INT)
        # Make a decision tree object
        tree = forest._make_estimator(append=False)
        # Init the tree's RandomState instance with generated seed
        # this will randomize what features the tree will use
        tree.set_params(random_state=check_random_state(seed))
        # If we are bootstrapping
        if forest.bootstrap:
            # If we are given labels
            if labels is not None:
                # Then need to bootstrap via labels
                # We can do this by using StratifiedShuffleSplit
                # to gain a random sample from each label
                sss = cross_validation.StratifiedShuffleSplit(labels, 
                                             n_iter=1, 
                                             test_size=np.unique(labels).size, 
                                             random_state=check_random_state(seed))
                # Then we'll bootstrap our X and y for the label samples chosen
                for train, test in sss:
                    X_lbs = X[test]
                    y_lbs = y[test]
                    break
                
                # Then get the number of samples
                n_samples = X_lbs.shape[0]
                # To generate a uniform sample weight
                curr_sample_weight = np.ones((n_samples,), dtype=np.float64)
                # Then randomly chooses n_samples from all samples with replacement
                indices = random_state.randint(0, n_samples, n_samples)
                # Use bincount to make a random binning histogram
                # that will sum up to n_samples
                sample_counts = bincount(indices, minlength=n_samples)
                # Apply these randomized counts to the old uniform weights
                curr_sample_weight *= sample_counts
                # Fit the tree using these new sample weights
                tree.fit(X_lbs, y_lbs, sample_weight=curr_sample_weight, check_input=False)
                # Then set the indices of the tree only to the samples that had non-zero weights
                tree.indices_ = sample_counts > 0.
            else:
                # Then get the number of samples
                n_samples = X.shape[0]
                # To generate a uniform sample weight
                curr_sample_weight = np.ones((n_samples,), dtype=np.float64)
                # Then randomly chooses n_samples from all samples with replacement
                indices = random_state.randint(0, n_samples, n_samples)
                # Use bincount to make a random binning histogram
                # that will sum up to n_samples
                sample_counts = bincount(indices, minlength=n_samples)
                # Apply these randomized counts to the old uniform weights
                curr_sample_weight *= sample_counts
                # Fit the tree using these new sample weights
                tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
                # Then set the indices of the tree only to the samples that had non-zero weights
                tree.indices_ = sample_counts > 0.
        # If we aren't bootstrapping
        else:
            # This just fit the data with no random weights
            tree.fit(X, y, check_input=False)
        # Add the grown tree to the container 
        trees.append(tree)
    # return all of the trained trees
    return trees
Author: ruffsl | Project: CS7616P1 | Lines: 74 | Source: labeled_bootstraping.py


Example 20: _make_test_folds

    def _make_test_folds(self, frame, y):
        if self.shuffle:
            rng = check_random_state(self.random_state)
        else:
            rng = self.random_state

        # validate that it's a string
        y = _val_y(y)  # gets a string back or None
        if y is None:
            raise ValueError('H2OStratifiedKFold requires a target name (got None)')

        target = frame[y].as_data_frame(use_pandas=True)[y].values
        n_samples = target.shape[0]
        unique_y, y_inversed = np.unique(target, return_inverse=True)
        y_counts = bincount(y_inversed)
        min_labels = np.min(y_counts)

        if np.all(self.n_folds > y_counts):
            raise ValueError(('All the n_labels for individual classes'
                              ' are less than %d folds.'
                              % self.n_folds), Warning)
        if self.n_folds > min_labels:
            warnings.warn(('The least populated class in y has only %d'
                           ' members, which is too few. The minimum'
                           ' number of labels for any class cannot'
                           ' be less than n_folds=%d.'
                           % (min_labels, self.n_folds)), Warning)

        # NOTE FROM SKLEARN:

        # pre-assign each sample to a test fold index using individual KFold
        # splitting strategies for each class so as to respect the balance of
        # classes
        # NOTE: Passing the data corresponding to ith class say X[y==class_i]
        # will break when the data is not 100% stratifiable for all classes.
        # So we pass np.zeros(max(c, n_folds)) as data to the KFold.

        # Remember, however that we might be using the old-fold KFold which doesn't
        # have a split method...
        if SK18:
            per_cls_cvs = [
                KFold(self.n_folds,  # using sklearn's KFold here
                      shuffle=self.shuffle,
                      random_state=rng).split(np.zeros(max(count, self.n_folds)))
                for count in y_counts
                ]
        else:
            per_cls_cvs = [
                KFold(max(count, self.n_folds),  # using sklearn's KFold here
                      self.n_folds,
                      shuffle=self.shuffle,
                      random_state=rng)
                for count in y_counts
                ]

        test_folds = np.zeros(n_samples, dtype=np.int)
        for test_fold_indices, per_cls_splits in enumerate(zip(*per_cls_cvs)):
            for cls, (_, test_split) in zip(unique_y, per_cls_splits):
                cls_test_folds = test_folds[target == cls]

                # the test split can be too big because we used
                # KFold(...).split(X[:max(c, n_folds)]) when data is not 100%
                # stratifiable for all the classes
                # (we use a warning instead of raising an exception)
                # If this is the case, let's trim it:
                test_split = test_split[test_split < len(cls_test_folds)]
                cls_test_folds[test_split] = test_fold_indices
                test_folds[target == cls] = cls_test_folds

        return test_folds
Author: tgsmith61591 | Project: skutil | Lines: 70 | Source: split.py



Note: The sklearn.utils.fixes.bincount examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The snippets were selected from open-source projects contributed by many developers; copyright remains with the original authors, and any distribution or use should follow the corresponding project's License. Do not reproduce without permission.

