首页 文章

报错:输入包含 NaN、无穷大,或超出 dtype('float64') 范围的值

提问于
浏览
1
## Load the data ##

# Read the training and test sets from the local Kaggle data directory.
train = pd.read_csv("../kagglehouse/train.csv")
test = pd.read_csv("../kagglehouse/test.csv")

# Stack the columns "MSSubClass" through "SaleCondition" of both sets
# (training rows first) into a single frame.
all_data = pd.concat([
    train.loc[:, "MSSubClass":"SaleCondition"],
    test.loc[:, "MSSubClass":"SaleCondition"],
])

NFOLDS = 5
SEED = 0
NROWS = None

ntrain, ntest = train.shape[0], test.shape[0]

# Creating matrices for sklearn: the first ntrain rows of all_data are the
# training features, the remaining rows are the test features.
y_train = train["SalePrice"]
x_train = np.array(all_data[:ntrain])
x_test = np.array(all_data[ntrain:])

kf = KFold(ntrain, n_folds=NFOLDS, shuffle=True, random_state=SEED)

class SklearnWrapper(object):
    """Thin adapter exposing ``train``/``predict`` on top of an estimator.

    Args:
        clf: Estimator class (not an instance); it is called as
            ``clf(**params)``.
        seed: Value passed to the estimator as ``random_state``.
        params: Optional dict of keyword arguments for the estimator
            constructor. It is copied, so the caller's dict is not modified.
    """

    def __init__(self, clf, seed=0, params=None):
        # Copy before injecting random_state: this keeps the caller's dict
        # untouched and also makes the default ``params=None`` usable.
        params = dict(params) if params is not None else {}
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        """Fit the wrapped estimator on ``x_train`` / ``y_train``."""
        # Fit on the data handed in by the caller rather than on module-level
        # globals, so each cross-validation fold trains on its own subset.
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.clf.predict(x)

def get_oof(clf):
    """Compute out-of-fold predictions for ``clf`` over the folds in ``kf``.

    For every fold the model is trained on the fold's training rows of the
    module-level ``x_train`` / ``y_train``, then used to predict both the
    fold's held-out rows and the whole of ``x_test``.

    Args:
        clf: Object exposing ``train(x, y)`` and ``predict(x)``.

    Returns:
        A pair of column vectors ``(train_predictions, test_predictions)``:
        the held-out predictions for each training row, and the per-fold
        test predictions averaged over all folds.
    """
    held_out_preds = np.zeros((ntrain,))
    per_fold_test_preds = np.empty((NFOLDS, ntest))

    for fold_no, (fit_rows, held_out_rows) in enumerate(kf):
        fit_x, fit_y = x_train[fit_rows], y_train[fit_rows]
        held_out_x = x_train[held_out_rows]

        clf.train(fit_x, fit_y)

        held_out_preds[held_out_rows] = clf.predict(held_out_x)
        per_fold_test_preds[fold_no, :] = clf.predict(x_test)

    # Average the test predictions across folds.
    mean_test_preds = np.zeros((ntest,))
    mean_test_preds[:] = per_fold_test_preds.mean(axis=0)
    return held_out_preds.reshape(-1, 1), mean_test_preds.reshape(-1, 1)


# Constructor keyword arguments for each base model.
et_params = {'n_jobs': 16}
rf_params = {'n_jobs': 16}

# NOTE(review): xgb_params is not referenced in the visible part of the
# script; presumably it is consumed by an XGBoost wrapper defined elsewhere.
xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
}

rd_params = {'alpha': 10}
ls_params = {'alpha': 0.005}

# Wrap each estimator class so it exposes the train/predict API used by
# get_oof; every model receives the same seed.
et = SklearnWrapper(clf=ExtraTreesRegressor, seed=SEED, params=et_params)
rf = SklearnWrapper(clf=RandomForestRegressor, seed=SEED, params=rf_params)
rd = SklearnWrapper(clf=Ridge, seed=SEED, params=rd_params)
ls = SklearnWrapper(clf=Lasso, seed=SEED, params=ls_params)

# Out-of-fold training predictions and fold-averaged test predictions
# for every base model.
et_oof_train, et_oof_test = get_oof(et)
rf_oof_train, rf_oof_test = get_oof(rf)
rd_oof_train, rd_oof_test = get_oof(rd)
ls_oof_train, ls_oof_test = get_oof(ls)

它看起来像这样

ValueError                                Traceback (most recent call
       last)
            in ()
               135 
               136 xg_oof_train, xg_oof_test = get_oof(xg)
           --> 137 et_oof_train, et_oof_test = get_oof(et)
               138 rf_oof_train, rf_oof_test = get_oof(rf)
               139 rd_oof_train, rd_oof_test = get_oof(rd)

            in get_oof(clf)
                77         x_te = x_train[test_index]
                78 
           ---> 79         clf.train(x_tr, y_tr)
                80 
                81         oof_train[test_index] = clf.predict(x_te)

            in train(self, x_train, y_train)
                46     def train(self, x_train, y_train):
                47         #self.clf.fit(x_train, y_train)
           ---> 48          self.clf.fit(x_train, y_train)
                49 
                50     def predict(self, x):

           E:\graphLab\Anaconda2\lib\site-packages\sklearn\ensemble\forest.pyc
       in fit(self, X, y, sample_weight)
               245         # Validate or convert input data
               246         X = check_array(X, accept_sparse="csc", dtype=DTYPE)
           --> 247         y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None)
               248         if sample_weight is not None:
               249             sample_weight = check_array(sample_weight, ensure_2d=False)

           E:\graphLab\Anaconda2\lib\site-packages\sklearn\utils\validation.pyc
       in check_array(array, accept_sparse, dtype, order, copy,
       force_all_finite, ensure_2d, allow_nd, ensure_min_samples,
       ensure_min_features, warn_on_dtype, estimator)
               420                              % (array.ndim, estimator_name))
               421         if force_all_finite:
           --> 422             _assert_all_finite(array)
               423 
               424     shape_repr = _shape_repr(array.shape)

           E:\graphLab\Anaconda2\lib\site-packages\sklearn\utils\validation.pyc
       in _assert_all_finite(X)
                41             and not np.isfinite(X).all()):
                42         raise ValueError("Input contains NaN, infinity"
           ---> 43                          " or a value too large for %r." % X.dtype)
                44 
                45 

           ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

我用 np.isnan(all_data.all()) 检查时返回 False,用 np.isfinite(all_data.all()) 检查时返回 True,所以我很困惑。为什么还会出现这个错误?

1 回答

  • 3

    您没有正确检查 all_data

    np.isnan( all_data.all() )
    np.isfinite( all_data.all() )
    

    这并不是检查数据的正确方式。

    您把 np.isnan() 和 np.isfinite() 应用到了 all_data.all() 的输出上,而该输出始终是布尔值 True / False,因此结果永远是"有限"且"非 NaN"。

    您应该将您的数据检查为:

    np.isfinite( all_data ).all()
    np.isnan( all_data ).all()
    

    请注意,all() 应作用在 np.isfinite() / np.isnan() 的结果上,而不是反过来。另外,若要判断数据中是否存在 NaN,通常应使用 np.isnan( all_data ).any()。

相关问题