我试图解析 this site 上的表格。我正在使用 Python 的 Beautiful Soup 来做到这一点。虽然它在我的 Ubuntu 14.04 机器上产生正确的输出,但在我朋友的 Windows 机器上却产生错误的输出。我在这里粘贴代码片段:
from bs4 import BeautifulSoup
def buildURL(agi, families):
    """Build the AthaMap gene-search URL for the given genes and families.

    Args:
        agi: Whitespace-separated string of gene identifiers.
        families: Whitespace-separated string of family names.

    Returns:
        The query URL as a string. Genes are joined with an encoded CRLF
        ("%0D%0A") inside the ``agi`` parameter, and every family adds one
        ``familySelected%5B<family>%5D=on`` parameter with "/" encoded as
        "%2F".
    """
    # split() without an argument drops empty tokens, so empty input or
    # repeated/leading/trailing spaces no longer yield blank gene entries
    # or a bogus "familySelected%5B%5D=on" parameter.
    genes = agi.split()
    family_names = families.split()

    parts = [
        "http://www.athamap.de/search_gene.php",
        # The "agi" parameter is always emitted (even when empty) so that
        # the query string keeps its leading "?".
        "?agi=",
        "%0D%0A".join(genes),
        "&upstream=-500&downstream=50&restriction=0"
        "&sortBy1=gen&sortBy2=fac&sortBy3=pos",
    ]
    parts.extend(
        "&familySelected%5B" + name.replace("/", "%2F") + "%5D=on"
        for name in family_names
    )
    parts.append("&formSubmitted=TRUE")
    return "".join(parts)
def fetch_html(agi, families):
    """Fetch the search result page and return its ``geneAnalysisDetail`` div.

    Args:
        agi: Space-separated gene identifiers, forwarded to ``buildURL``.
        families: Space-separated family names, forwarded to ``buildURL``.

    Returns:
        The ``div`` element whose id is ``geneAnalysisDetail``, or an empty
        string when the page contains no such element.
    """
    url = buildURL(agi, families)
    response = requests.get(url)

    # Pass the decoded text to BeautifulSoup as-is. Wrapping it in str()
    # performs an implicit ASCII encode on Python 2, which raises
    # UnicodeEncodeError as soon as the page has a non-ASCII character.
    soup = BeautifulSoup(response.text, "lxml")

    # Filtering on the id attribute replaces the manual loop whose bare
    # "except:" hid every error, not just the missing-id case.
    matches = soup.find_all("div", id="geneAnalysisDetail")

    # The last match wins, as it did when the loop kept overwriting its
    # result; "" remains the not-found value callers may test for.
    return matches[-1] if matches else ""
def parse(seldiv):
    """Extract the table rows from the element returned by ``fetch_html``.

    Prints the list of attribute names, then walks every ``tr`` after the
    first two and collects the text of each ``td``.

    Args:
        seldiv: Parsed element providing ``find_all``.

    Returns:
        A list of rows, each a list of cell strings (presumably in the
        order of the printed attribute names - confirm against the page).
    """
    rows = seldiv.find_all('tr')
    attributes = ["Gene", "Factor", "Family", "Position", "Relative orientation", "Relative Distance", "Max score", "Threshold Score", "Score"]
    # The parenthesized form prints the same list on Python 2 and is also
    # valid syntax on Python 3, unlike the bare print statement.
    print(attributes)

    # Cells at these positions keep their value one level deeper, inside
    # the second child of the <td>; all other cells hold it directly.
    nested_columns = (0, 1, 3)

    save_rows = []
    # The first two rows are skipped (presumably header rows - confirm).
    for row in rows[2:]:
        lst = []
        for j, col in enumerate(row.find_all('td')):
            if j in nested_columns:
                node = col.contents[1].contents[0]
            else:
                node = col.contents[0]
            # Column 0 used to go through re.sub('', '', ...), which only
            # replaces empty matches with nothing and so returned the
            # string unchanged; dropping it also removes the need for an
            # "re" import that is not visible in this snippet.
            lst.append(str(node))
        save_rows.append(lst)
    return save_rows
知道这里可能出现什么问题吗?使用 lxml 和不使用 lxml 两种方式我都试过了。
提前致谢 .
2 回答
一种可能是您没有为请求添加用户代理(User-Agent)。不同的用户代理有时会得到不同的结果,尤其是在一些古怪的网站上。这是所有可能的用户代理的列表,只需选择一个即可,它不必与您的机器实际相符。
你可以用这种方式解析表格,并且应该在两台机器上都能很好地工作 .
`buildURL` 函数应保持不变。