首页 文章

适合(三重)高斯到数据python

提问于
浏览
3

我的问题的简短版本如下:我有一些数据(行星密度)的直方图,似乎有3个偷看 . 现在我想让3位高斯人适应这个直方图 .

我期待这个结果 .

我使用了不同的方法来适应我的高斯:来自sklearn.mixture的curve_fit,least square和GaussianMixture . 使用Curve_fit,我非常适合

但如果将它与我的预期结果进行比较,那就不够好了 . 至少方格我得到了“合适”


但是我的高斯是无稽之谈,而且对于GaussianMixture我真的很擅长我在示例中看到的代码来解决我的问题 .

At this point I have three questions:

  • 最重要的是:我怎样才能更好地适应我的第三高斯?我已经尝试调整p0的初始值,但随后高斯变得更差或根本找不到参数 .

  • 我的最小二乘代码出了什么问题?为什么它会给我这样奇怪的高斯人?有没有办法解决这个问题?我的猜测:是不是因为最小二乘法可以最大限度地减少拟合和实际数据之间的误差?

  • 如何使用GaussianMixture完成整个事情?我找到了这篇文章

但无法适应我的问题 .

我真的很想了解如何正确配合,因为我将来必须做很多事情 . 问题是我在统计方面不是很好,只是开始用python编程 .

这是我的三个不同代码:

Curvefit

import numpy as np
import math
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit

hist, bin_edges = np.histogram(Density, bins=np.logspace(np.log10(MIN), np.log10(MAX), 32))

bin_centres = (bin_edges[:-1] + bin_edges[1:])/2

# Define model function to be used to fit to the data above:
def triple_gaussian(  x,*p ):
    (c1, mu1, sigma1, c2, mu2, sigma2, c3, mu3, sigma3) = p
    res =    np.divide(1,x)*c1 * np.exp( - (np.log(x) - mu1)**2.0 / (2.0 * sigma1**2.0) ) \
          +  np.divide(1,x)*c2 * np.exp( - (np.log(x) - mu2)**2.0 / (2.0 * sigma2**2.0) ) \
          +  np.divide(1,x)*c3 * np.exp( - (np.log(x) - mu3)**2.0 / (2.0 * sigma3**2.0) )
    return res

# p0 is the initial guess for the fitting coefficients (A, mu and sigma above)
p0 = [60., 1, 1., 30., 1., 1.,10., 1., 1]

coeff, var_matrix = curve_fit(triple_gaussian, bin_centres, hist, p0=p0)

# Get the fitted curve
hist_fit = triple_gaussian(bin_centres, *coeff)

c1 =coeff[0]
mu1 =coeff[1]
sigma1 =coeff[2]
c2 =coeff[3]
mu2 =coeff[4]
sigma2 =coeff[5]
c3 =coeff[6]
mu3 =coeff[7]
sigma3 =coeff[8]
x= bin_centres

gauss1= np.divide(1,x)*c1 * np.exp( - (np.log(x) - mu1)**2.0 / (2.0 * sigma1**2.0) )
gauss2= np.divide(1,x)*c2 * np.exp( - (np.log(x) - mu2)**2.0 / (2.0 * sigma2**2.0) )
gauss3= np.divide(1,x)*c3 * np.exp( - (np.log(x) - mu3)**2.0 / (2.0 * sigma3**2.0) )

plt.plot(x,gauss1, 'g',label='gauss1')
plt.plot(x,gauss2, 'b', label='gauss2')
plt.plot(x,gauss3, 'y', label='gauss3')
plt.gca().set_xscale("log")
plt.legend(loc='upper right')
plt.ylim([0,70])
plt.suptitle('Triple log Gaussian fit over all Data', fontsize=20)
plt.xlabel('log(Density)')
plt.ylabel('Number')

plt.hist(Density, bins=np.logspace(np.log10(MIN), np.log10(MAX), 32), label='all Densities')
plt.plot(bin_centres, hist, label='Test data')
plt.plot(bin_centres, hist_fit, label='Fitted data')
plt.gca().set_xscale("log") 
plt.ylim([0,70])
plt.suptitle('triple log Gaussian fit using curve_fit', fontsize=20)
plt.xlabel('log(Density)')
plt.ylabel('Number')
plt.legend(loc='upper right')
plt.annotate(Text1, xy=(0.01, 0.95), xycoords='axes fraction')
plt.annotate(Text2, xy=(0.01, 0.90), xycoords='axes fraction')
plt.savefig('all Densities_gauss')
plt.show()

Leastsquare

适合itselfe看起来并不坏,但3高斯是可怕的 . 看这里

# I only have x-data, so to get according y-data I make my histogram and
 #use the bins as x-data and the numbers (hist) as y-data. 
#Density is a Dataset of 581 Values between 0 and 340.

hist, bin_edges = np.histogram(Density, bins=np.logspace(np.log10(MIN), np.log10(MAX), 32))
x = (bin_edges[:-1] + bin_edges[1:])/2
y = hist

#define tripple gaussian

def triple_gaussian(  p,x ):
    (c1, mu1, sigma1, c2, mu2, sigma2, c3, mu3, sigma3) = p
    res =    np.divide(1,x)*c1 * np.exp( - (np.log(x) - mu1)**2.0 / (2.0 * sigma1**2.0) ) \
          +  np.divide(1,x)*c2 * np.exp( - (np.log(x) - mu2)**2.0 / (2.0 * sigma2**2.0) ) \
          +  np.divide(1,x)*c3 * np.exp( - (np.log(x) - mu3)**2.0 / (2.0 * sigma3**2.0) )
    return res

def errfunc(p,x,y):
   return y-triple_gaussian(p,x)

p0=[]
p0 = [60., 0.1, 1., 30., 1., 1.,10., 10., 1.]
fit = optimize.leastsq(errfunc,p0,args=(x,y))

print('fit', fit)



plt.plot(x,y)
plt.plot(x,triple_gaussian(fit[0],x), 'r')
plt.gca().set_xscale("log")
plt.ylim([0,70])
plt.suptitle('Double log Gaussian fit over all Data', fontsize=20)
plt.xlabel('log(Density)')
plt.ylabel('Number')

c1, mu1, sigma1, c2, mu2, sigma2, c3, mu3, sigma3=fit[0]

print('c1', c1)

gauss1= np.divide(1,x)*c1 * np.exp( - (np.log(x) - mu1)**2.0 / (2.0 * sigma1**2.0) )
gauss2= np.divide(1,x)*c2 * np.exp( - (np.log(x) - mu2)**2.0 / (2.0 * sigma2**2.0) )
gauss3= np.divide(1,x)*c3 * np.exp( - (np.log(x) - mu3)**2.0 / (2.0 * sigma3**2.0) )

plt.plot(x,gauss1, 'g')
plt.plot(x,gauss2, 'b')
plt.plot(x,gauss3, 'y')
plt.gca().set_xscale("log")
plt.ylim([0,70])
plt.suptitle('Double log Gaussian fit over all Data', fontsize=20)
plt.xlabel('log(Density)')
plt.ylabel('Number')

GaussianMixture

正如我所说,我不太了解GaussianMixture . 我不知道我是否必须像之前那样定义一个三重奏,或者它是否足以定义高斯,而GaussianMixture将发现它本身就有一个三重高斯 . 我也不明白我在哪里使用哪些数据,因为当我使用bin和hist值时,“拟合曲线”只是相互连接的数据点 . 所以我认为我使用了错误的数据 .

我不理解的部分是#Fit GMM和#Construct函数手动作为高斯的总和 .

hist, bin_edges = np.histogram(Density, bins=np.logspace(np.log10(MIN), np.log10(MAX), 32))

bin_centres = (bin_edges[:-1] + bin_edges[1:])/2

plt.hist(Density, bins=np.logspace(np.log10(MIN), np.log10(MAX), 32), label='all Densities')
plt.gca().set_xscale("log") 
plt.ylim([0,70])

# Define simple gaussian
def gauss_function(x, amp, x0, sigma):
    return np.divide(1,x)*amp * np.exp(-(np.log(x) - x0) ** 2. / (2. * sigma ** 2.))

# My Data
samples = Density

# Fit GMM
gmm = GaussianMixture(n_components=3, covariance_type="full", tol=0.00001)
gmm = gmm.fit(X=np.expand_dims(samples, 1))

gmm_x= bin_centres
gmm_y= hist
# Construct function manually as sum of gaussians
gmm_y_sum = np.full_like(gmm_x, fill_value=0, dtype=np.float32)
for m, c, w in zip(gmm.means_.ravel(), gmm.covariances_.ravel(), gmm.weights_.ravel()):
    gauss = gauss_function(x=gmm_x, amp=1, x0=m, sigma=np.sqrt(c))
    gmm_y_sum += gauss / np.trapz(gauss, gmm_x) *w 

# Make regular histogram
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=[8, 5])
ax.hist(samples, bins=np.logspace(np.log10(MIN), np.log10(MAX), 32), label='all Densities')
ax.plot(gmm_x, gmm_y, color="crimson", lw=4, label="GMM")
ax.plot(gmm_x, gmm_y_sum, color="black", lw=4, label="Gauss_sum", linestyle="dashed")
plt.gca().set_xscale("log") 
plt.ylim([0,70])

# Annotate diagram
ax.set_ylabel("Probability density")
ax.set_xlabel("Arbitrary units")

# Make legend
plt.legend()

plt.show()

我希望任何人都能帮助我至少解决我的一个问题 . 正如我所说,如果遗漏任何内容或者如果您需要更多信息,请告诉我 .

提前致谢!

  • 编辑 - Here是我的数据 .

2 回答

  • 1

    拥有实际数据的链接会很有帮助,但我可以在没有数据的情况下提出一些建议 .

    首先,将 x 转换为 np.log(x) 非常简单,可能值得付出努力 .

    其次,高斯的定义通常不包括 1./x - 它可能是一个小的效果,但你的 x 的值正在改变一个数量级,所以可能不是 .

    第三,你为所有三个高斯人提供 mu 的相同起始值 . 这使得适合更加困难 . 尝试给出更接近实际预期值的起点,如果可能的话,限制这些值 .

    为了帮助解决这些问题,您可能会发现lmfit(https://lmfit.github.io/lmfit-py/)很有帮助 . 它肯定会使你的脚本更短,也许就像

    import numpy as np
    import matplotlib.pyplot as plt
    from lmfit.models import GaussianModel
    
    y, bin_edges = np.histogram(Density, bins=np.logspace(np.log10(MIN), np.log10(MAX), 32))
    
    x = np.log((bin_edges[:-1] + bin_edges[1:])/2.0) #take log here
    
    # build a model as a sum of 3 Gaussians
    model = (GaussianModel(prefix='g1_') + GaussianModel(prefix='g2_') + 
             GaussianModel(prefix='g3_'))
    
    # build Parameters with initial values
    params = model.make_params(g1_amplitude=60, g1_center=-1.0, g1_sigma=1,
                               g2_amplitude=30, g2_center= 0.0, g1_sigma=1,
                               g2_amplitude=10, g2_center= 1.0, g1_sigma=1)
    
    # optionally, set bound / constraints on Parameters:
    params['g1_center'].max = 0
    
    params['g2_center'].min = -1.0
    params['g2_center'].max = 1.0
    
    params['g3_center'].min = 0
    
    # perform the actual fit
    result = model.fit(y, params, x=x)
    
    # print fit statistics and values and uncertainties for variables
    print(result.fit_report())
    
    # evaluate the model components ('g1_', 'g2_', and 'g3_')
    comps = result.eval_components(result.params, x=x)
    
    # plot the results
    plt.plot(x, y, label='data')
    plt.plot(x, result.best_fit, label='best fit')
    
    plt.plot(x, comps['g1_'], label='gaussian1')
    plt.plot(x, comps['g2_'], label='gaussian2')
    plt.plot(x, comps['g3_'], label='gaussian3')
    # other plt methods for axes and labels
    plt.show()
    

    如果你的模型真的需要 (1/x) 倍高斯,或者你需要一个不同的函数形式 . 您可以使用内置的LognormalModel,其他内置模型之一,或者轻松编写自己的模型函数并将其包装起来 .

    希望有所帮助 .

  • 0

    对于你的具体情况,总结三个Gaussian或mixed model之间没有区别,后者只关注保持规范 . 基本上,我只是简化并清理了你的版本 . 它运行良好,但要注意结果取决于箱的数量非常显着 .

    import matplotlib.pyplot as plt
    import numpy as np
    import scipy.optimize as so
    
    data = np.loadtxt( "data.txt" )
    myBins = np.logspace( np.log10( min( data ) ), np.log10( max( data ) ), 35 )
    
    """ as we are logarithmic I calculate the bin 'centre' logarithmic as well """
    xBins = np.fromiter( ( ( 10**( np.log10( x * y ) / 2. ) ) for x,y in zip( myBins[:-1], myBins[1:] ) ), np.float ) 
    vals, bins = np.histogram( data, myBins )
    
    def chunks( l, n ):
        """Yield successive n-sized chunks from l."""
        for i in range( 0, len( l ), n ):
            yield l[ i:i + n ]
    
    
    """  I use a simplified version without the 1/x """
    def my_gauss( x, c, mu, sig ):
        #~ out = c * np.exp( - ( np.log( x ) - mu )**2.0 / (2.0 * sig**2.0 ) ) * np.divide( 1, x )
        out = c * np.exp( - ( np.log( x ) - mu )**2.0 / (2.0 * sig**2.0 ) )
        return out
    
    
    def triple_residuals( params, xData, yData ):
        yTh = np.zeros_like( yData, dtype=np.float )
        for params in chunks( params, 3 ) :
            yTh += np.fromiter( ( my_gauss( x, *params ) for x in xData ), np.float )
        diff = yData - yTh
        return diff
    
    
    sol, err = so.leastsq( triple_residuals, [ 40, -2.1, 1.1, 10, -0.1, 1.1, 10, 2.1, 1.1 ], args=( xBins, vals )  )
    
    
    myxList = np.logspace( np.log10( min( data ) ), np.log10( max( data ) ), 150 )
    
    """ for guessing start values """
    #~ myg1List = np.fromiter( ( my_gauss( x, 40, -2.1, 1.1 ) for x in myxList ), np.float )
    #~ myg2List = np.fromiter( ( my_gauss( x, 20, -0.1, 1.2 ) for x in myxList ), np.float )
    #~ myg3List = np.fromiter( ( my_gauss( x, 10, 2.1, 1.3 ) for x in myxList ), np.float )
    
    
    fig = plt.figure()
    ax = fig.add_subplot( 1, 1, 1)
    ax.plot( bins[:-1], vals )
    
    """ for plotting start values """
    #~ ax.plot( myxList,  myg1List )
    #~ ax.plot( myxList,  myg2List )
    #~ ax.plot( myxList,  myg3List )
    
    gs = dict()
    for i,params in enumerate( chunks( sol, 3) ) :
        print params
        gs[i] = np.fromiter( ( my_gauss( x, *params ) for x in myxList ), np.float )
        ax.plot( myxList,  gs[i], ls='--' )
    
    gsAll = gs[0] + gs[1] + gs[2]
    ax.plot( myxList,  gsAll, lw=3 )
    
    ax.set_xscale('log')
    plt.show()
    

    并提供:

    >>[58.91221784 -2.1544611   0.89842033]
    >>[21.29816862  0.13135854  0.80339236]
    >>[5.44419833 2.42596666 0.85324204]
    

    fitted data

相关问题