• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    迪恩网络公众号

Python preprocessing.Binarizer类代码示例

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

本文整理汇总了Python中sklearn.preprocessing.Binarizer的典型用法代码示例。如果您正苦于以下问题:Python Binarizer类的具体用法?Python Binarizer怎么用?Python Binarizer使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。



在下文中一共展示了Binarizer类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。

示例1: cv_mean_std_array

def cv_mean_std_array(X, y, alphas, ks, n_a, n_k, cv=20):
    """Run kNN cross-validation over a flattened (alpha, k) grid.

    For each of the ``n_a`` alpha groups, ``df_Lasso`` is called once with
    the group's first alpha; its first return value is then passed to
    ``knn_cv_mean_and_std`` for each of the ``n_k`` entries of that group.

    Parameters
    ----------
    X : feature data, forwarded unchanged to ``df_Lasso``.
    y : target values; forwarded to ``df_Lasso`` as-is and thresholded at
        1400 with ``Binarizer`` for the kNN step.
    alphas, ks : sequences indexed with the flat position ``i * n_k + j``
        (so they need at least ``n_a * n_k`` entries).
    n_a : number of alpha groups.
    n_k : number of entries per alpha group.
    cv : forwarded to ``knn_cv_mean_and_std``.

    Returns
    -------
    (cv_mean, cv_std, regressors) : two arrays of length ``n_a * n_k`` and
        a DataFrame with one column per alpha group.
    """
    # The original read the undefined names n_alphas / n_ks here and in the
    # index arithmetic below; the parameters n_a / n_k are what is meant.
    n = n_a * n_k
    cv_mean = np.empty(n)
    cv_std = np.empty(n)
    regressors = pd.DataFrame()

    binarizer = Binarizer(threshold=1400)
    y_binary = binarizer.transform(y).transpose().ravel()

    itt_counter = 0
    print('size n_a: %d n_k: %d' % (n_a, n_k))
    for i in range(n_a):
        base = i * n_k  # flat index of the first entry of alpha group i
        print('reg. column : %d' % base)
        temp_string = 'alpha=%f' % alphas[base]
        print(temp_string)
        print(regressors.shape)
        df_temp = pd.DataFrame()
        print('computing for alpha = %f' % alphas[base])
        X_lasso, df_temp[temp_string] = df_Lasso(X, y, alphas[base])
        regressors = pd.concat([regressors, df_temp], ignore_index=True, axis=1)
        for j in range(n_k):
            pos = base + j
            print('i:%d, j:%d' % (i, j))
            print('computing for alpha = %f and k = %f' % (alphas[pos], ks[pos]))
            print('X_lasso shape:')
            print(X_lasso.shape)
            cv_mean[pos], cv_std[pos] = knn_cv_mean_and_std(
                X_lasso, y_binary, alphas[pos], ks[pos], cv=cv)
            itt_counter = itt_counter + 1
            print('completed %dth iteration of knn cv mean:%f std:%f, at pos:%d'
                  % (itt_counter, cv_mean[pos], cv_std[pos], pos))
    return cv_mean, cv_std, regressors
开发者ID:AveryLiu,项目名称:Data-Mining,代码行数:29,代码来源:kNN-iterator.py


示例2: cv_mean_std_array

def cv_mean_std_array(X, y, alphas, n_a, cv=20):
    """Collect cross-validation means/stds of three linear models per alpha.

    ``lm_cv_mean_and_std`` is called once for each of the first ``n_a``
    entries of ``alphas``; its six return values are stored at position
    ``i`` of six result arrays.

    Parameters
    ----------
    X : feature data, forwarded to ``lm_cv_mean_and_std``.
    y : target values, thresholded at 1400 with ``Binarizer``.
    alphas : sequence with at least ``n_a`` entries.
    n_a : number of alphas to evaluate.
    cv : accepted for interface compatibility; it is not forwarded (the
        original did not forward it either).

    Returns
    -------
    Six arrays of length ``n_a``: OLS mean/std, lasso mean/std,
    ridge mean/std.
    """
    binarizer = Binarizer(threshold=1400)
    y_binary = binarizer.transform(y).transpose().ravel()
    (cv_ols_means, cv_ols_stds,
     cv_lasso_means, cv_lasso_stds,
     cv_ridge_means, cv_ridge_stds) = (np.empty(n_a) for _ in range(6))

    for i in range(n_a):
        print('computing for alpha=%f' % alphas[i])
        # The original call had an empty second argument (a syntax error).
        # y_binary is computed above and used nowhere else, so it is the
        # evident intended argument -- NOTE(review): confirm against
        # lm_cv_mean_and_std's signature.
        (cv_ols_means[i], cv_ols_stds[i],
         cv_lasso_means[i], cv_lasso_stds[i],
         cv_ridge_means[i], cv_ridge_stds[i]) = lm_cv_mean_and_std(X, y_binary, alphas[i])
        print('successfully computed iteration %d' % i)
    return cv_ols_means, cv_ols_stds, cv_lasso_means, cv_lasso_stds, cv_ridge_means, cv_ridge_stds
开发者ID:AveryLiu,项目名称:Data-Mining,代码行数:10,代码来源:linear-models-iterator.py


示例3: initialize

def initialize():
    """Train and return a 3-nearest-neighbour classifier on binarized images.

    Images and labels come from ``load_mnist_data()``. The images are passed
    through a default-constructed ``Binarizer`` before a
    ``KNeighborsClassifier`` (``n_neighbors=3``, ``metric='jaccard'``) is
    fitted on them.
    """
    images, labels = load_mnist_data()

    # Fit and apply the binarizer in one chain; the fitted object is not
    # needed afterwards.
    binary_images = Binarizer().fit(images).transform(images)

    classifier = KNeighborsClassifier(n_neighbors=3, metric='jaccard')
    classifier.fit(binary_images, labels)
    return classifier
开发者ID:mikokm,项目名称:DigitGuesser,代码行数:10,代码来源:classifiers.py


示例4: binarizeMatrix

def binarizeMatrix(dataMatrix, threshold):
    """
    Return ``dataMatrix`` binarized with ``Binarizer(threshold=threshold)``.

    Per the scikit-learn documentation, values strictly greater than
    ``threshold`` become 1 and all other values become 0.
    """
    return Binarizer(threshold=threshold).fit_transform(dataMatrix)
开发者ID:Gliganu,项目名称:DMC_Fashion_2016,代码行数:10,代码来源:DatasetManipulator.py


示例5: test_binarizer

def test_binarizer():
    """Check Binarizer output counts and copy semantics on dense and sparse input."""
    X_ = np.array([[1, 0, 5], [2, 3, 0]])

    def check_counts(dense, n_zeros, n_ones):
        # Number of cells equal to 0 and to 1 in the binarized result.
        assert_equal(np.sum(dense == 0), n_zeros)
        assert_equal(np.sum(dense == 1), n_ones)

    for init in (np.array, sp.csr_matrix, sp.csc_matrix):

        X = init(X_.copy())

        # threshold=2.0: the result keeps the container type of the input.
        binarizer = Binarizer(threshold=2.0, copy=True)
        check_counts(toarray(binarizer.transform(X)), 4, 2)
        assert_equal(type(X), type(binarizer.transform(X)))

        # Default threshold after an explicit fit().
        binarizer = Binarizer(copy=True).fit(X)
        X_bin = toarray(binarizer.transform(X))
        assert_true(X_bin is not X)
        check_counts(X_bin, 2, 4)

        # copy=True without fit(): the result must be a new object.
        binarizer = Binarizer(copy=True)
        X_bin = binarizer.transform(X)
        assert_true(X_bin is not X)
        check_counts(toarray(X_bin), 2, 4)

        # copy=False: the very same object is returned.
        binarizer = Binarizer(copy=False)
        X_bin = binarizer.transform(X)
        assert_true(X_bin is X)
        check_counts(toarray(X_bin), 2, 4)
开发者ID:Big-Data,项目名称:scikit-learn,代码行数:33,代码来源:test_preprocessing.py


示例6: test_binarizer_vs_sklearn

def test_binarizer_vs_sklearn():
    # Compare msmbuilder.preprocessing.Binarizer
    # with sklearn.preprocessing.Binarizer

    # BinarizerR is fitted on the concatenated trajectories, Binarizer on
    # the list of trajectories itself.
    reference = BinarizerR()
    reference.fit(np.concatenate(trajs))

    candidate = Binarizer()
    candidate.fit(trajs)

    # Both must agree on the first trajectory.
    expected = reference.transform(trajs[0])
    actual = candidate.transform(trajs)[0]

    np.testing.assert_array_almost_equal(expected, actual)
开发者ID:Eigenstate,项目名称:msmbuilder,代码行数:14,代码来源:test_preprocessing.py


示例7: wine_quality_white

def wine_quality_white():
    """Convert the white wine quality CSV into an ``.npz`` file.

    Columns 0-10 of the ``;``-separated file are saved as ``data``. Column
    11 is binarized at threshold 4 and then flipped (1 -> 0, 0 -> 1) before
    being saved as ``label``.
    """
    filename = '../../data/raw/mldata/winequality-white.csv'

    # Features: the first 11 columns of the csv file.
    data = np.loadtxt(filename, usecols=tuple(range(11)), delimiter=';', dtype=float)

    # Labels: column 11, thresholded at 4.
    quality = np.loadtxt(filename, usecols=(11,), delimiter=';', dtype=int)
    binary_quality = Binarizer(threshold=4).fit_transform(quality)

    # Invert the binary label and flatten it to one dimension.
    label = np.ravel(np.abs(binary_quality - 1))

    np.savez('../../data/clean/uci-wine-quality-white.npz', data=data, label=label)
开发者ID:I2Cvb,项目名称:data_balancing,代码行数:15,代码来源:conversion.py


示例8: fit

 def fit(self, X, y=None):
     """
     Fit the binarizer: choose one threshold per feature.

     X is validated with check_array (sparse input is converted to CSC).
     For unsupervised methods the thresholds and scores set by
     _fit_unsupervised are used directly. For supervised methods y is
     required; it is one-hot encoded with SK_LabelBinarizer and, when
     there are more than two classes, each feature takes the threshold of
     the class with the highest min-max normalized score.

     Raises ValueError for an unknown method, a missing y in the
     supervised case, or a sample-count mismatch between X and y.
     Returns self.
     """
     known_methods = Binarizer._UNSUPERVISED_METHODS + Binarizer._SUPERVISED_METHODS
     if self.method not in known_methods:
         raise ValueError("Method should be one of {0}".format(", ".join(known_methods)))
     X = check_array(X, accept_sparse=['csr', 'csc'])
     if issparse(X):
         X = X.tocsc()
     if self.method in Binarizer._UNSUPERVISED_METHODS:
         self._fit_unsupervised(X)
         self.joint_thresholds_ = self.thresholds_
         self.joint_scores_ = self.scores_
     elif y is None:
         raise ValueError("y must not be None for supervised binarizers.")
     else:
         # TODO (from the original author): move the label encoding into
         # a separate function.
         encoder = SK_LabelBinarizer()
         Y_new = encoder.fit_transform(y)
         self.classes_ = encoder.classes_
         n_x, n_y = X.shape[0], Y_new.shape[0]
         if n_x != n_y:
             raise ValueError("X and y have incompatible shapes.\n"
                              "X has %s samples, but y has %s." %
                              (n_x, n_y))
         self._fit_supervised(X, Y_new)
         if len(self.classes_) > 2:
             # Min-max normalize the scores column by column; a zero
             # range is replaced by 1 to avoid dividing by zero.
             score_min = np.min(self.scores_, axis=0)
             score_span = np.max(self.scores_, axis=0) - score_min
             score_span[score_span == 0] = 1
             normalized = (self.scores_ - score_min) / score_span
             # For every feature, find the class for which it is the most
             # useful (the original author notes this could probably be
             # done differently).
             best_class = np.argmax(normalized, axis=1)
             feature_idx = np.arange(self.thresholds_.shape[0])
             # Use the threshold (and score) of that class for the feature.
             self.joint_thresholds_ = self.thresholds_[feature_idx, best_class]
             self.joint_scores_ = self.scores_[feature_idx, best_class]
         else:
             # At most two classes: take column 0.
             self.joint_thresholds_ = self.thresholds_[:, 0]
             self.joint_scores_ = self.scores_[:, 0]
     # Hand the thresholds over to SK_Binarizer.
     self.binarize_transformer_ = SK_Binarizer(self.joint_thresholds_)
     return self
开发者ID:AlexeySorokin,项目名称:pyparadigm,代码行数:56,代码来源:feature_selector.py


示例9: do_logreg

def do_logreg():
    """Grid-search and cross-validate a logistic regression on auto_mpg.csv.

    The ``mpg`` column is binarized at its mean to form the target and the
    remaining columns (minus ``car_name``) are scaled. The data is split
    50/50; ``C`` is grid-searched on the training half and the best
    estimator is cross-validated on the test half. Progress and results
    are printed. Reads ``nfolds`` and ``np`` from the enclosing module.
    """
    from sklearn.preprocessing import Binarizer, scale
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score,classification_report
    from sklearn.cross_validation import train_test_split
    from sklearn.cross_validation import cross_val_score
    from sklearn.grid_search import GridSearchCV
    from scipy.stats import expon
    import pandas

    # Load the data and give the columns readable names.
    column_names = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
                    'acceleration', 'model_year', 'origin', 'car_name']
    frame = pandas.read_csv('auto_mpg.csv')
    frame.columns = column_names
    frame = frame.drop('car_name', 1)

    model = LogisticRegression()
    mpg_binarizer = Binarizer(threshold=frame['mpg'].mean())
    print("Performing binarization of the mpg variable into above/below average classes")
    target = mpg_binarizer.fit_transform(frame['mpg'])
    features = scale(frame.drop('mpg', 1))
    print("Splitting into training and test sets")
    data_train, data_test, target_train, target_test = train_test_split(
        features, target, test_size=0.5, random_state=0)

    grid = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    print('Searching for optimal C in {} using {}-fold validation on test set '.format(grid, nfolds))
    clf = GridSearchCV(model, [{'C': grid}], cv=nfolds, scoring='accuracy')
    clf.fit(data_train, target_train)
    for params, mean_score, _ in clf.grid_scores_:
        print("{}: Mean accuracy {}".format(params, mean_score))

    print("""Cross-validating above/below average mpg prediction
        using {}-fold validation on the test dataset.
        Using the best estimator: {}
        """.format(nfolds, clf.best_estimator_))

    fold_scores = cross_val_score(clf.best_estimator_, data_test, target_test, cv=nfolds)
    mean_cross = np.mean(fold_scores)

    print("Mean cross-validated accuracy after optimization is: {}".format(mean_cross))
开发者ID:jmccutchan,项目名称:GA_homework,代码行数:42,代码来源:sklearn_logreg.py


示例10: us_crime

def us_crime():
    """Convert the US crime ``communities.data`` file into an ``.npz`` file.

    The first five columns are dropped and the rest is passed through
    ``Imputer``. All but the last column are saved as ``data``; the last
    column is binarized at 0.65 and saved as ``label``.
    """
    filename = '../../data/raw/mldata/communities.data'

    # Missing entries are read as NaN; the first 5 columns are discarded
    # (the original author notes that 122 continuous features remain).
    raw = np.genfromtxt(filename, delimiter=',')[:, 5:]

    # Fill in the missing values (the original comment says: by the mean).
    imputed = Imputer(verbose=1).fit_transform(raw)

    # Split into the features and the column to be binarized.
    data = imputed[:, :-1]
    target_column = imputed[:, -1]
    label = np.ravel(Binarizer(threshold=0.65).fit_transform(target_column))

    np.savez('../../data/clean/uci-us-crime.npz', data=data, label=label)
开发者ID:I2Cvb,项目名称:data_balancing,代码行数:20,代码来源:conversion.py


示例11: OneHotEncoder

from sklearn.preprocessing import Binarizer, LabelEncoder, OneHotEncoder

# Demonstrates three sklearn preprocessing transformers on a tiny list of
# string categories: LabelEncoder, OneHotEncoder and Binarizer.
onehot_encoder = OneHotEncoder()
label_encoder = LabelEncoder()

x = ['a', 'b', 'c']

# Encode the strings, then reshape to a (len(x), 1) column so the result can
# be fed to the transformers below.
label_x = label_encoder.fit_transform(x).reshape([len(x), 1])
print(label_x)
# One-hot encode the label column and print it as a dense array.
print(onehot_encoder.fit_transform(label_x).toarray())

# Binarize the same label column with threshold 1.0 and print the result.
binarizer = Binarizer(threshold=1.0).fit(label_x)
print(binarizer.transform(label_x))
开发者ID:yaochitc,项目名称:learning_libraries,代码行数:13,代码来源:features.py


示例12: Binarizer


# In[3]:

# Import csv data
# Load the csv and drop its first column (the url).
raw_data = pd.read_csv('OnlineNewsPopularity_wLabels_deleteNoise.csv').iloc[:, 1:]      # read in csv, omit the first column of url
# Drop the last column as well.
# NOTE(review): presumably ' shares' is the last column left after this
# step, since it is still read by name below -- confirm against the csv.
raw_data = raw_data.iloc[:, :-1] 
news_data = raw_data.iloc[:, :-1]      # Take up to the second last column
news_labels = raw_data.iloc[:, -1]      # Take shares column for labels

# Binarize: labels are thresholded at the median of the ' shares' column.
print '\nBinary Threshold:'
binary_threshold = np.median(raw_data[' shares'])
# ' n_non_stop_words' is removed from the feature frame.
news_data = news_data.drop(' n_non_stop_words', 1)
print binary_threshold
binarizer = Binarizer(threshold=binary_threshold)
# Flatten the binarized labels to one dimension.
y_binary = binarizer.transform(news_labels).transpose().ravel() 


# In[ ]:

# Discretize


# In[25]:

# Decision Tree: 10-fold cross-validation on the binarized labels.
# NOTE(review): cross_val_score is not imported in the visible fragment;
# presumably it is imported in an earlier cell.
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier()
print 'Decision Tree Classifier Accuracy Rate'
tree_score = cross_val_score(tree, news_data, y_binary, cv=10)
开发者ID:AveryLiu,项目名称:Data-Mining,代码行数:29,代码来源:DecisionTree&NB.py


示例13: DictVectorizer

# Split extracted_data (defined outside this fragment) into features and labels.
news_data = extracted_data.iloc[:, :-1]      # Take up to the second last column
news_labels = extracted_data[' shares']      # Take shares column for labels

# Data Preprocessing: turn every row into a dict so DictVectorizer can
# consume it (transpose first, so to_dict() is keyed by row).
news_data_transpose = news_data.transpose()
data_into_dict = news_data_transpose.to_dict()
# NOTE(review): dict.iteritems() exists only in Python 2.
list_data = [v for k, v in data_into_dict.iteritems()]

# Encode the row dicts into a dense feature array.
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer()
transformed_data = dv.fit_transform(list_data).toarray()

# Label Encoder - Binarization
from sklearn.preprocessing import Binarizer
binarizer = Binarizer(threshold=1400)                           # Threshold at 1400 because median of shares is 1400
transformed_labels = binarizer.transform(news_labels)
transformed_labels = transformed_labels.transpose().ravel()     # .ravel() is to fix "Too many array indices error"
                                                                # Could be a scikit or pandas bug
############## Classification #################

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC

# Instantiate the classifiers with default settings: decision tree,
# k-nearest neighbours and Gaussian naive Bayes.
tree = DecisionTreeClassifier()
knn = KNeighborsClassifier()
gnb = GaussianNB()
开发者ID:AveryLiu,项目名称:Data-Mining,代码行数:31,代码来源:Data_Preprocessing_Script.py


示例14: ngram

#---------------------------------------------------------------------------------------
#
#	Comment section below out if you already have made pickle files
#
#---------------------------------------------------------------------------------------

# Build the bigram feature set from the training data and count the bigrams
# in both splits.
all_bigr = ngram(X_train, 'bigram') #starting with all features

print("Starting counting bigrams...")
X_train_bi_counted = count(X_train, all_bigr, 'bigram')
print("Done counting train set")
X_test_bi_counted = count(X_test, all_bigr, 'bigram')
print("Done counting test set")

print("Binarizing and dumping files")
# NOTE: the name `bin` shadows the builtin; it is kept because later parts
# of the script (not visible here) may refer to it.
bin = Binarizer()
X_train_bi_binary = bin.fit_transform(X_train_bi_counted)
X_test_bi_binary = bin.transform(X_test_bi_counted)
# Use context managers so every pickle file is flushed and closed
# deterministically (the original never closed the handles it opened).
with open("X_train_bi_binary.p", "wb") as pickle_file:
    pickle.dump(X_train_bi_binary, pickle_file)
with open("X_test_bi_binary.p", "wb") as pickle_file:
    pickle.dump(X_test_bi_binary, pickle_file)
print("Done")


print("Starting tfidf vectors...")
X_train_bi_tfidf, X_test_bi_tfidf = tfidf(X_train_bi_counted, X_test_bi_counted)
with open("X_train_bi_tfidf.p", "wb") as pickle_file:
    pickle.dump(X_train_bi_tfidf, pickle_file)
with open("X_test_bi_tfidf.p", "wb") as pickle_file:
    pickle.dump(X_test_bi_tfidf, pickle_file)
print("Done")


print("Starting feature selection using CART random forests on binary files")
开发者ID:MariaBarrett,项目名称:LPIIExam,代码行数:31,代码来源:ngram.py


示例15: print

# The test matrix gets the same number of columns as the training matrix X.
_, n_features = X.get_shape()

print('Loading test data...')
with open('data/test-svmlight.dat') as infile:
	lines = infile.readlines()
	n_samples = len(lines)
	# One row per input line; filled cell by cell, converted to CSR below.
	test = lil_matrix((n_samples, n_features))
	for n,line in enumerate(lines):
		# Each whitespace-separated token has the form "<feature id>:<count>".
		for word_count in line.split():
			fid, count = word_count.split(':')
			# Store the parsed count. The original stored int(fid) here,
			# i.e. the feature id itself, and never used `count`.
			test[n,int(fid)] = float(count)
test = test.tocsr()

if opts.binarize:
	print('Binarizing the data...')
	# copy=False: transform the matrices without copying them first.
	binar = Binarizer(copy=False)
	X = binar.transform(X)
	test = binar.transform(test)

if opts.tfidf:
	print('Transforming word occurrences into TF-IDF...')
	# Fit the TF-IDF weights on the training matrix only and reuse them
	# for the test matrix.
	tranny = TfidfTransformer()
	X = tranny.fit_transform(X)
	test = tranny.transform(test)

if opts.select_features:
	k_features = int(opts.k_features)
	if opts.select_features == 'k-best':
		print('Selecting %i best features...' % k_features)
		ch2 = SelectKBest(chi2, k=k_features)
	if opts.select_features == 'pct':
开发者ID:Androidized,项目名称:BabysFirstTextClassifier,代码行数:31,代码来源:extract.py


示例16: load

def load(opt='custom', x_filename=None, y_filename=None, n_samples=0,
         samples_on='rows', **kwargs):
    """Load a specified dataset.

    This function can be used either to load one of the standard scikit-learn
    datasets or a different dataset saved as X.npy Y.npy in the working
    directory.

    Parameters
    -----------
    opt : {'iris', 'digits', 'diabetes', 'boston', 'circles', 'moons',
          'custom', 'GSEXXXXX'}, default: 'custom'
        Name of a predefined dataset to be loaded. 'iris', 'digits', 'diabetes'
        'boston', 'circles' and 'moons' refer to the correspondent
        `scikit-learn` datasets. 'custom' can be used to load a custom dataset
        which name is specified in `x_filename` and `y_filename` (optional).

    x_filename : string, default : None
        The data matrix file name.

    y_filename : string, default : None
        The label vector file name.

    n_samples : int
        The number of samples to be loaded. This comes handy when dealing with
        large datasets. When n_samples is less than the actual size of the
        dataset this function performs a random subsampling that is stratified
        w.r.t. the labels (if provided).

    samples_on : string
        This can be either in ['row', 'rows'] if the samples lie on the row of
        the input data matrix, or viceversa in ['col', 'cols'] the other way
        around.

    data_sep : string
        The data separator. For instance comma, tab, blank space, etc.

    Returns
    -----------
    X : array of float, shape : n_samples x n_features
        The input data matrix.

    y : array of float, shape : n_samples
        The label vector; np.nan if missing.

    feature_names : array of integers (or strings), shape : n_features
        The feature names; a range of number if missing.

    index : list of integers (or strings)
        This is the samples identifier, if provided as first column (or row) of
        of the input file. Otherwise it is just an incremental range of size
        n_samples.
    """
    data = None
    try:
        if opt.lower() == 'iris':
            data = datasets.load_iris()
        elif opt.lower() == 'digits':
            data = datasets.load_digits()
        elif opt.lower() == 'diabetes':
            data = datasets.load_diabetes()
            b = Binarizer(threshold=np.mean(data.target))
            data.target = b.fit_transform(data.data)
        elif opt.lower() == 'boston':
            data = datasets.load_boston()
            b = Binarizer(threshold=np.mean(data.target))
            data.target = b.fit_transform(data.data)
        elif opt.lower() == 'gauss':
            means = np.array([[-1, 1, 1, 1], [0, -1, 0, 0], [1, 1, -1, -1]])
            sigmas = np.array([0.33, 0.33, 0.33])
            if n_samples <= 1:
                n_samples = 333
            xx, yy = generate_gauss(mu=means, std=sigmas, n_sample=n_samples)
            data = datasets.base.Bunch(data=xx, target=yy)
        elif opt.lower() == 'circles':
            if n_samples == 0:
                n_samples = 400
            xx, yy = datasets.make_circles(n_samples=n_samples, factor=.3,
                                           noise=.05)
            data = datasets.base.Bunch(data=xx, target=yy)
        elif opt.lower() == 'moons':
            if n_samples == 0:
                n_samples = 400
            xx, yy = datasets.make_moons(n_samples=n_samples, noise=.01)
            data = datasets.base.Bunch(data=xx, target=yy)
        elif opt.lower() == 'custom':
            data = load_custom(x_filename, y_filename, samples_on, **kwargs)
        elif opt.lower().startswith('gse'):
            raise Exception("Use ade_GEO2csv.py to convert GEO DataSets"
                            "into csv files.")
    except IOError as e:
        print("I/O error({0}): {1}".format(e.errno, e.strerror))

    X, y = data.data, data.target
    if n_samples > 0 and X.shape[0] > n_samples:
        if y is not None:
            try:  # Legacy for sklearn
                sss = StratifiedShuffleSplit(y, test_size=n_samples, n_iter=1)
                # idx = np.random.permutation(X.shape[0])[:n_samples]
            except TypeError:
#.........这里部分代码省略.........
开发者ID:slipguru,项目名称:adenine,代码行数:101,代码来源:data_source.py


示例17: Binarizer

from Models import InteractionFeatures, Model, Bounder, RemoveDuplicateCols, ReturnSame, f1, lad

from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor

#%%

# Switch to the workspace directory; the logging config file below is
# given by a relative path, so it is looked up from there.
os.chdir(workspace)

logging.config.fileConfig('loggerConfig.properties')

logger = logging.getLogger('alllog')
logger.debug("Starting...")

# Binarizer with the externally defined threshold `thresh`; it is not used
# again within this fragment.
binarizer = Binarizer(copy=True, threshold=thresh)

# Union of pairwise interaction features, one transformer per arithmetic
# operation: x+y, x-y, x*y, x/y and the reversed division y/x ('if|').
# All share the same `corr_thresh`, `subsample=1` and logger.
featureunion1 = FeatureUnion([
                              #('duplicater',ReturnSame()),
                              ('if+',InteractionFeatures(method = lambda x,y:(x+y), threshold = corr_thresh,subsample = 1,logger=logger)),
                              ('if-',InteractionFeatures(method = lambda x,y:(x-y), threshold = corr_thresh,subsample = 1,logger=logger)),
                              ('if*',InteractionFeatures(method = lambda x,y:(x*y), threshold = corr_thresh,subsample = 1,logger=logger)),
                              ('if/',InteractionFeatures(method = lambda x,y:(x/y), threshold = corr_thresh,subsample = 1,logger=logger)),
                              ('if|',InteractionFeatures(method = lambda x,y:(y/x), threshold = corr_thresh,subsample = 1,logger=logger))
                               ])

# Preprocessing pipeline: RemoveDuplicateCols, then the interaction
# features, then Bounder.
# NOTE(review): Bounder(inf, -inf) presumably handles infinite values
# produced by the divisions above -- confirm against Models.Bounder.
pp_pipeline = Pipeline([
                        ('removedupes',RemoveDuplicateCols(logger=logger)),
                        ('featureextraction',featureunion1),
                        ('bounder',Bounder(inf,-inf))
                        ])
开发者ID:vpatanjali,项目名称:Python,代码行数:31,代码来源:model_refactored.py


示例18: test_binarizer

def test_binarizer():
    """Check Binarizer counts, copy semantics and the negative-threshold rule."""
    X_ = np.array([[1, 0, 5], [2, 3, -1]])

    def check_counts(dense, n_zeros, n_ones):
        # Number of cells equal to 0 and to 1 in the binarized result.
        assert_equal(np.sum(dense == 0), n_zeros)
        assert_equal(np.sum(dense == 1), n_ones)

    for init in (np.array, list, sparse.csr_matrix, sparse.csc_matrix):

        X = init(X_.copy())

        # threshold=2.0: sparse input stays sparse, dense stays dense.
        binarizer = Binarizer(threshold=2.0, copy=True)
        check_counts(toarray(binarizer.transform(X)), 4, 2)
        X_bin = binarizer.transform(X)
        assert_equal(sparse.issparse(X), sparse.issparse(X_bin))

        # Default threshold after an explicit fit().
        binarizer = Binarizer(copy=True).fit(X)
        X_bin = toarray(binarizer.transform(X))
        assert_true(X_bin is not X)
        check_counts(X_bin, 2, 4)

        # copy=True without fit(): the result must be a new object.
        binarizer = Binarizer(copy=True)
        X_bin = binarizer.transform(X)
        assert_true(X_bin is not X)
        check_counts(toarray(X_bin), 2, 4)

        # copy=False: the same object comes back, except for list input,
        # for which identity is not checked.
        binarizer = Binarizer(copy=False)
        X_bin = binarizer.transform(X)
        if init is not list:
            assert_true(X_bin is X)
        check_counts(toarray(X_bin), 2, 4)

    # Negative threshold on dense input.
    binarizer = Binarizer(threshold=-0.5, copy=True)
    for init in (np.array, list):
        X = init(X_.copy())

        check_counts(toarray(binarizer.transform(X)), 1, 5)
        binarizer.transform(X)

    # Cannot use threshold < 0 for sparse
    assert_raises(ValueError, binarizer.transform, sparse.csc_matrix(X))
开发者ID:abouaziz,项目名称:scikit-learn,代码行数:46,代码来源:test_preprocessing.py


示例19: Binarizer

           'LR': LogisticRegression,
           'LSVC' : LinearSVC,
           'SVC' : SVC
           }

#%%
    
# Switch to the workspace directory; the .npy paths below are built from
# dev_filename / val_filename, which are defined outside this fragment.
os.chdir(workspace)

# Load the two "idvs" arrays; NaNs are replaced via nan_to_num.
dev_idvs_all = numpy.nan_to_num(numpy.load(dev_filename + ".npy"))
val_idvs_all = numpy.nan_to_num(numpy.load(val_filename + ".npy"))

# Load the matching "_dvs" arrays the same way.
dev_dvs = numpy.nan_to_num(numpy.load(dev_filename + "_dvs.npy"))
val_dvs = numpy.nan_to_num(numpy.load(val_filename + "_dvs.npy"))

# `thresh` is defined outside this fragment. The imputer is created here
# but not used within the visible lines.
binarizer = Binarizer(copy=True, threshold=thresh)
imputer = Imputer(copy = False)

# Binarize the dvs arrays and reshape each to a flat vector with one entry
# per row.
dev_dvs_binary = binarizer.transform(dev_dvs).reshape((dev_dvs.shape[0],))
val_dvs_binary = binarizer.transform(val_dvs).reshape((val_dvs.shape[0],))

"""
from statsmodels.regression import quantile_regression

dev_idvs2 = dev_idvs[:10000,:]
inds = [i for i in xrange(dev_idvs2.shape[1]) if len(unique(dev_idvs2[:,i])) > 1]
dev_dvs2 = dev_dvs[:10000,:].reshape((10000,))

model = quantile_regression.QuantReg(dev_dvs2, dev_idvs2)
model.fit()
"""
开发者ID:vpatanjali,项目名称:Python,代码行数:31,代码来源:model.py


示例20: Binarizer

class Binarizer(TransformerMixin):
    """
    Реализует различные стратегии бинаризации признаков,
    вычисляя оптимальные пороги и производя бинаризацию с данными порогами

    Аргументы:
    ----------
    method: str('random', 'log_odds' or 'bns'), метод бинаризации признаков
    divide_to_bins: bool(optional, default=True),
        индикатор приведения количественных признаков к целочисленным
    bins_number: int(optional, default=10),
        число возможных значений целочисленных признаков при бинаризации
    """
    _UNSUPERVISED_METHODS = ['random']
    _SUPERVISED_METHODS = ['log_odds', 'bns']
    _CONTINGENCY_METHODS = ['log_odds', 'bns']

    def __init__(self, method, divide_to_bins=True, bins_number=10):
        """
        Store the binarization settings on the instance.

        method: str, name of the thresholding strategy; fit() rejects
            anything outside _UNSUPERVISED_METHODS + _SUPERVISED_METHODS
        divide_to_bins: bool, per the class docstring, whether quantitative
            features are converted to integer ones
        bins_number: int, per the class docstring, the number of possible
            values of the integer features
        """
        self.method = method
        self.divide_to_bins = divide_to_bins
        self.bins_number = bins_number

    def fit(self, X, y=None):
        """
        Fit the binarizer: choose one threshold per feature.

        X is validated with check_array (sparse input is converted to CSC).
        For unsupervised methods the thresholds and scores set by
        _fit_unsupervised are used directly. For supervised methods y is
        required; it is one-hot encoded with SK_LabelBinarizer and, when
        there are more than two classes, each feature takes the threshold
        of the class with the highest min-max normalized score.

        Raises ValueError for an unknown method, a missing y in the
        supervised case, or a sample-count mismatch between X and y.
        Returns self.
        """
        known_methods = Binarizer._UNSUPERVISED_METHODS + Binarizer._SUPERVISED_METHODS
        if self.method not in known_methods:
            raise ValueError("Method should be one of {0}".format(", ".join(known_methods)))
        X = check_array(X, accept_sparse=['csr', 'csc'])
        if issparse(X):
            X = X.tocsc()
        if self.method in Binarizer._UNSUPERVISED_METHODS:
            self._fit_unsupervised(X)
            self.joint_thresholds_ = self.thresholds_
            self.joint_scores_ = self.scores_
        elif y is None:
            raise ValueError("y must not be None for supervised binarizers.")
        else:
            # TODO (from the original author): move the label encoding
            # into a separate function.
            encoder = SK_LabelBinarizer()
            Y_new = encoder.fit_transform(y)
            self.classes_ = encoder.classes_
            n_x, n_y = X.shape[0], Y_new.shape[0]
            if n_x != n_y:
                raise ValueError("X and y have incompatible shapes.\n"
                                 "X has %s samples, but y has %s." %
                                 (n_x, n_y))
            self._fit_supervised(X, Y_new)
            if len(self.classes_) > 2:
                # Min-max normalize the scores column by column; a zero
                # range is replaced by 1 to avoid dividing by zero.
                score_min = np.min(self.scores_, axis=0)
                score_span = np.max(self.scores_, axis=0) - score_min
                score_span[score_span == 0] = 1
                normalized = (self.scores_ - score_min) / score_span
                # For every feature, find the class for which it is the
                # most useful (the original author notes this could
                # probably be done differently).
                best_class = np.argmax(normalized, axis=1)
                feature_idx = np.arange(self.thresholds_.shape[0])
                # Use the threshold (and score) of that class.
                self.joint_thresholds_ = self.thresholds_[feature_idx, best_class]
                self.joint_scores_ = self.scores_[feature_idx, best_class]
            else:
                # At most two classes: take column 0.
                self.joint_thresholds_ = self.thresholds_[:, 0]
                self.joint_scores_ = self.scores_[:, 0]
        # Hand the thresholds over to SK_Binarizer.
        self.binarize_transformer_ = SK_Binarizer(self.joint_thresholds_)
        return self

    def transform(self, X):
        """
        Apply the fitted binarizer to X.

        Prints a progress message, then delegates to the transformer
        stored in binarize_transformer_ by fit(). Raises ValueError when
        that attribute is missing, i.e. fit() has not been run.
        """
        print("Transforming binarizer...")
        if not hasattr(self, 'binarize_transformer_'):
            raise ValueError("Transformer is not fitted")
        return self.binarize_transformer_.transform(X)

    def _fit_unsupervised(self, X):
        """
        Управляющая функция для методов подбора порога без учителя
        """
        if self.method == 'random':
            # случайные пороги и полезности
            if issparse(X):
                minimums = X.min(axis=0).toarray()
                maximums = X.max(axis=0).toarray()
            else:
                minimums = np.min(X, axis=0)
#.........这里部分代码省略.........
开发者ID:AlexeySorokin,项目名称:pyparadigm,代码行数:101,代码来源:feature_selector.py



注:本文中的sklearn.preprocessing.Binarizer类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。


鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
Python preprocessing.FunctionTransformer类代码示例发布时间:2022-05-27
下一篇:
Python preprocessing.scale函数代码示例发布时间:2022-05-27
热门推荐
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap