首页 文章

使用python beautifulsoup进行Web解析会产生不一致的结果

提问于
浏览
1

我试图解析 this site 的表。我用 Python 的 BeautifulSoup 来做这件事。它在我的 Ubuntu 14.04 机器上产生正确的输出,但在我朋友的 Windows 机器上却产生错误的输出。我在这里粘贴代码片段:

from bs4 import BeautifulSoup

def buildURL(agi, families):
    """Assemble the athamap.de gene-search URL.

    agi and families are space-separated strings of AGI gene ids and
    transcription-factor family names, respectively.
    """
    gene_list = agi.split(" ")
    family_list = families.split(" ")

    url = "http://www.athamap.de/search_gene.php"
    if gene_list:
        # Genes are joined by a percent-encoded CRLF, mimicking the
        # multi-line textarea the site's form submits.
        url += "?agi=" + "%0D%0A".join(gene_list)

    url += "&upstream=-500&downstream=50&restriction=0&sortBy1=gen&sortBy2=fac&sortBy3=pos"

    for family in family_list:
        # A '/' inside a family name (e.g. GARP/ARR-B) must be encoded.
        url += "&familySelected%5B" + family.replace("/", "%2F") + "%5D=on"
    return url + "&formSubmitted=TRUE"

def fetch_html(agi, families):
    """Fetch the athamap result page for the given genes/families and
    return the div with id "geneAnalysisDetail" (the table of interest),
    or "" when the page contains no such div.

    Fixes over the original: a direct id lookup replaces scanning every
    <div> while swallowing KeyError with a bare ``except:``, and the
    redundant ``str()`` around ``response.text`` is dropped.
    """
    url = buildURL(agi, families)
    response = requests.get(url)

    soup = BeautifulSoup(response.text, "lxml")

    # find() with an id filter returns the first match or None.
    seldiv = soup.find("div", id="geneAnalysisDetail")
    # Preserve the original "" sentinel for callers that expect it.
    return seldiv if seldiv is not None else ""

def parse(seldiv):
    """Extract the result table from the "geneAnalysisDetail" div.

    Returns a list of rows, each a list of cell strings; the first two
    <tr> elements (header rows) are skipped.

    Fixes over the original: Python 3 ``print()`` call instead of the
    py2 print statement, and removal of ``re.sub('', '', ...)`` which
    was a no-op that depended on an unimported ``re`` module.
    """
    rows = seldiv.find_all('tr')

    attributes = ["Gene", "Factor", "Family", "Position", "Relative orientation", "Relative Distance", "Max score", "Threshold Score", "Score"]

    print(attributes)
    save_rows = []
    for row in rows[2:]:
        cols = row.find_all('td')
        lst = []
        for j, col in enumerate(cols):
            # Columns 0, 1 and 3 wrap their value in a nested tag;
            # the remaining columns hold the text directly.
            if j in (0, 1, 3):
                lst.append(str(col.contents[1].contents[0]))
            else:
                lst.append(str(col.contents[0]))
        save_rows.append(lst)
    return save_rows

知道这里可能出现什么问题吗?无论是否使用 lxml,我都试过了。

提前致谢 .

2 回答

  • 0

    一种可能是你没有为请求设置用户代理(User-Agent)。不同的用户代理有时会得到不同的结果,尤其是对一些奇怪的网站。下面是一份常见 User-Agent 的列表,随便选一个即可,不必与你的机器一致。

    # Pool of real desktop-browser User-Agent strings (Chrome, Firefox,
    # Safari, Edge, IE on Windows/macOS/Linux) to pick from when sending
    # requests, so the server sees an ordinary browser.
    USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/602.4.8 (KHTML, like Gecko) Version/10.0.3 Safari/602.4.8',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/602.4.8 (KHTML, like Gecko) Version/10.0.3 Safari/602.4.8',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:52.0) Gecko/20100101 Firefox/52.0',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:51.0) Gecko/20100101 Firefox/51.0',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:51.0) Gecko/20100101 Firefox/51.0',
    'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0'
    ]
    
  • 0

    你可以用这种方式解析表格,并且应该在两台机器上都能很好地工作 . buildURL 函数应保持不变 .

    import requests
    from bs4 import BeautifulSoup
    
    def fetch_html(url):
        """Download *url* and return the div with id "geneAnalysisDetail"
        (None when the page has no such div)."""
        markup = requests.get(url).text
        document = BeautifulSoup(markup, "lxml")
        return document.find("div", id="geneAnalysisDetail")
    
    def parse(url):
        """Fetch the result page for *url* and return the table body as a
        list of rows, each a list of cleaned cell strings."""
        soup = fetch_html(url)
        all_rows = soup.find_all("tr")

        attributes = ["Gene", "Factor", "Family", "Position", "Relative orientation", "Relative Distance", "Max score", "Threshold Score", "Score"]

        save_rows = []
        # The first two <tr> elements are header rows; skip them.
        for row in all_rows[2:]:
            cleaned = []
            for cell in row.find_all("td"):
                # Trim surrounding spaces first, then surrounding newlines
                # (deliberately two steps, not a single .strip()).
                cleaned.append(cell.get_text().strip(" ").strip("\n"))
            save_rows.append(cleaned)
        return save_rows
    
    # Example invocation: query several AGI gene ids against three
    # pre-selected TF families on athamap.de and print each parsed row.
    url = "http://www.athamap.de/search_gene.php?agi=At1g76540%0D%0AAt3g12280%0D%0AAt4g28980%0D%0AAt4g37630%0D%0AAt5g11300%0D%0AAt5g27620%0D%0A&upstream=-500&downstream=50&restriction=0&sortBy1=gen&sortBy2=fac&sortBy3=pos&familySelected[ARF]=on&familySelected[CAMTA]=on&familySelected[GARP%2FARR-B]=on&formSubmitted=TRUE"
    save_rows = parse(url)
    for row in save_rows:
        print(row)
    

相关问题