首页 文章

快速信息增益计算

提问于
浏览
9

我需要在> 10k文档中为 text classification 计算> 100k特征的 Information Gain 分数 . 下面的代码工作正常,但 for the full dataset is very slow - 笔记本电脑上需要一个多小时 . 数据集是20newsgroup,我使用的是scikit-learn, chi2 函数,它在scikit中提供的工作非常快 .

知道如何为这样的数据集更快地计算信息增益吗?

def information_gain(x, y):

    def _entropy(values):
        counts = np.bincount(values)
        probs = counts[np.nonzero(counts)] / float(len(values))
        return - np.sum(probs * np.log(probs))

    def _information_gain(feature, y):
        feature_set_indices = np.nonzero(feature)[1]
        feature_not_set_indices = [i for i in feature_range if i not in feature_set_indices]
        entropy_x_set = _entropy(y[feature_set_indices])
        entropy_x_not_set = _entropy(y[feature_not_set_indices])

        return entropy_before - (((len(feature_set_indices) / float(feature_size)) * entropy_x_set)
                                 + ((len(feature_not_set_indices) / float(feature_size)) * entropy_x_not_set))

    feature_size = x.shape[0]
    feature_range = range(0, feature_size)
    entropy_before = _entropy(y)
    information_gain_scores = []

    for feature in x.T:
        information_gain_scores.append(_information_gain(feature, y))
    return information_gain_scores, []

EDIT:

我合并了内部函数并运行 cProfiler ,如下所示(在数据集上限制为~15k特征和~1k文档):

cProfile.runctx(
    """for feature in x.T:
    feature_set_indices = np.nonzero(feature)[1]
    feature_not_set_indices = [i for i in feature_range if i not in feature_set_indices]

    values = y[feature_set_indices]
    counts = np.bincount(values)
    probs = counts[np.nonzero(counts)] / float(len(values))
    entropy_x_set = - np.sum(probs * np.log(probs))

    values = y[feature_not_set_indices]
    counts = np.bincount(values)
    probs = counts[np.nonzero(counts)] / float(len(values))
    entropy_x_not_set = - np.sum(probs * np.log(probs))

    result = entropy_before - (((len(feature_set_indices) / float(feature_size)) * entropy_x_set)
                             + ((len(feature_not_set_indices) / float(feature_size)) * entropy_x_not_set))
    information_gain_scores.append(result)""",
    globals(), locals())

结果排名前20位 tottime

ncalls  tottime percall cumtime percall filename:lineno(function)
1       60.27   60.27   65.48   65.48   <string>:1(<module>)
16171   1.362   0   2.801   0   csr.py:313(_get_row_slice)
16171   0.523   0   0.892   0   coo.py:201(_check)
16173   0.394   0   0.89    0   compressed.py:101(check_format)
210235  0.297   0   0.297   0   {numpy.core.multiarray.array}
16173   0.287   0   0.331   0   compressed.py:631(prune)
16171   0.197   0   1.529   0   compressed.py:534(tocoo)
16173   0.165   0   1.263   0   compressed.py:20(__init__)
16171   0.139   0   1.669   0   base.py:415(nonzero)
16171   0.124   0   1.201   0   coo.py:111(__init__)
32342   0.123   0   0.123   0   {method 'max' of 'numpy.ndarray' objects}
48513   0.117   0   0.218   0   sputils.py:93(isintlike)
32342   0.114   0   0.114   0   {method 'sum' of 'numpy.ndarray' objects}
16171   0.106   0   3.081   0   csr.py:186(__getitem__)
32342   0.105   0   0.105   0   {numpy.lib._compiled_base.bincount}
32344   0.09    0   0.094   0   base.py:59(set_shape)
210227  0.088   0   0.088   0   {isinstance}
48513   0.081   0   1.777   0   fromnumeric.py:1129(nonzero)
32342   0.078   0   0.078   0   {method 'min' of 'numpy.ndarray' objects}
97032   0.066   0   0.153   0   numeric.py:167(asarray)

看起来大部分时间花在 _get_row_slice 上 . 我不完全确定第一行,看起来它涵盖了我提供给 cProfile.runctx 的整个块,虽然我不知道为什么第一行 totime=60.27 和第二行 tottime=1.362 之间存在这么大的差距 . 差异花在哪里?是否可以在 cProfile 中查看?

基本上,看起来问题是稀疏矩阵运算(切片,获取元素) - 解决方案可能是使用矩阵代数(如 chi2 is implemented in scikit )计算 Information Gain . 但是我不知道如何用矩阵运算来表达这个计算......任何人都有一个想法?

3 回答

  • 0

    Don 't know whether it still helps since a year has passed. But now I happen to be faced with the same task for text classification. I'使用为稀疏矩阵提供的nonzero()函数重写了您的代码 . 然后我只扫描nz,计算相应的y_value并计算熵 .

    以下代码仅需要几秒钟来运行news20数据集(使用libsvm稀疏矩阵格式加载) .

    def information_gain(X, y):
    
        def _calIg():
            entropy_x_set = 0
            entropy_x_not_set = 0
            for c in classCnt:
                probs = classCnt[c] / float(featureTot)
                entropy_x_set = entropy_x_set - probs * np.log(probs)
                probs = (classTotCnt[c] - classCnt[c]) / float(tot - featureTot)
                entropy_x_not_set = entropy_x_not_set - probs * np.log(probs)
            for c in classTotCnt:
                if c not in classCnt:
                    probs = classTotCnt[c] / float(tot - featureTot)
                    entropy_x_not_set = entropy_x_not_set - probs * np.log(probs)
            return entropy_before - ((featureTot / float(tot)) * entropy_x_set
                                 +  ((tot - featureTot) / float(tot)) * entropy_x_not_set)
    
        tot = X.shape[0]
        classTotCnt = {}
        entropy_before = 0
        for i in y:
            if i not in classTotCnt:
                classTotCnt[i] = 1
            else:
                classTotCnt[i] = classTotCnt[i] + 1
        for c in classTotCnt:
            probs = classTotCnt[c] / float(tot)
            entropy_before = entropy_before - probs * np.log(probs)
    
        nz = X.T.nonzero()
        pre = 0
        classCnt = {}
        featureTot = 0
        information_gain = []
        for i in range(0, len(nz[0])):
            if (i != 0 and nz[0][i] != pre):
                for notappear in range(pre+1, nz[0][i]):
                    information_gain.append(0)
                ig = _calIg()
                information_gain.append(ig)
                pre = nz[0][i]
                classCnt = {}
                featureTot = 0
            featureTot = featureTot + 1
            yclass = y[nz[1][i]]
            if yclass not in classCnt:
                classCnt[yclass] = 1
            else:
                classCnt[yclass] = classCnt[yclass] + 1
        ig = _calIg()
        information_gain.append(ig)
    
        return np.asarray(information_gain)
    
  • 0

    这是一个使用矩阵运算的版本 . 特征的IG是其特定于类的分数的平均值 .

    import numpy as np
    from scipy.sparse import issparse
    from sklearn.preprocessing import LabelBinarizer
    from sklearn.utils import check_array
    from sklearn.utils.extmath import safe_sparse_dot
    
    
    def ig(X, y):
    
        def get_t1(fc, c, f):
            t = np.log2(fc/(c * f))
            t[~np.isfinite(t)] = 0
            return np.multiply(fc, t)
    
        def get_t2(fc, c, f):
            t = np.log2((1-f-c+fc)/((1-c)*(1-f)))
            t[~np.isfinite(t)] = 0
            return np.multiply((1-f-c+fc), t)
    
        def get_t3(c, f, class_count, observed, total):
            nfc = (class_count - observed)/total
            t = np.log2(nfc/(c*(1-f)))
            t[~np.isfinite(t)] = 0
            return np.multiply(nfc, t)
    
        def get_t4(c, f, feature_count, observed, total):
            fnc = (feature_count - observed)/total
            t = np.log2(fnc/((1-c)*f))
            t[~np.isfinite(t)] = 0
            return np.multiply(fnc, t)
    
        X = check_array(X, accept_sparse='csr')
        if np.any((X.data if issparse(X) else X) < 0):
            raise ValueError("Input X must be non-negative.")
    
        Y = LabelBinarizer().fit_transform(y)
        if Y.shape[1] == 1:
            Y = np.append(1 - Y, Y, axis=1)
    
        # counts
    
        observed = safe_sparse_dot(Y.T, X)          # n_classes * n_features
        total = observed.sum(axis=0).reshape(1, -1).sum()
        feature_count = X.sum(axis=0).reshape(1, -1)
        class_count = (X.sum(axis=1).reshape(1, -1) * Y).T
    
        # probs
    
        f = feature_count / feature_count.sum()
        c = class_count / float(class_count.sum())
        fc = observed / total
    
        # the feature score is averaged over classes
        scores = (get_t1(fc, c, f) +
                get_t2(fc, c, f) +
                get_t3(c, f, class_count, observed, total) +
                get_t4(c, f, feature_count, observed, total)).mean(axis=0)
    
        scores = np.asarray(scores).reshape(-1)
    
        return scores, []
    

    在具有1000个实例和1000个唯一特征的数据集上,此实现比没有矩阵操作的实现快100倍 .

  • 8

    这是代码 feature_not_set_indices = [i for i in feature_range if i not in feature_set_indices] 占用90%的时间,尝试更改设置操作

相关问题