首页 文章

python pandas模块无法获取电影名称

提问于
浏览 808
0

我正在尝试一些网络抓取的测试代码,但无法从网站上获取所有电影的名称。代码如下:

from requests import get
from bs4 import BeautifulSoup
import pandas as pd

# Scrape the first page of IMDb's 2017 releases (sorted by vote count) and
# collect name, year, IMDb rating, Metascore and vote count into a DataFrame.

# The query string must sit on the same line as the path: a line break or
# space inside the literal is a syntax error and would corrupt the URL.
url = 'http://www.imdb.com/search/title?release_date=2017&sort=num_votes,desc&page=1'

response = get(url)
print(response.text[:500])

html_soup = BeautifulSoup(response.text, 'html.parser')
# NOTE: bare expressions such as the one below only echo a value in an
# interactive session; they have no effect when this runs as a script.
type(html_soup)

# Passing a multi-class string to class_ matches the exact value of the
# element's class attribute, so the string must not contain line breaks.
movie_containers = html_soup.find_all('div', class_ = 'lister-item mode-advanced')
print(type(movie_containers))
print(len(movie_containers))

first_movie = movie_containers[0]
first_movie

first_movie.div

first_movie.a

first_movie.h3

first_movie.h3.a

first_name = first_movie.h3.a.text

# The class string is rejoined as 'text-muted' with no trailing space;
# the broken form 'text- muted unbold ' could never match.
first_year = first_movie.h3.find('span', class_ = 'lister-item-year text-muted unbold')

print(first_movie.strong)

first_imdb = float(first_movie.strong.text)
print("IMDB= ", first_imdb)

first_mscore = first_movie.find('span', class_ = 'metascore favorable')

first_mscore = int(first_mscore.text)
print("First MetaScore", first_mscore)

first_votes = first_movie.find('span', attrs = {'name':'nv'})
first_votes['data-value']
first_votes = int(first_votes['data-value'])
print("First_Votes=", first_votes)

eighth_movie_mscore = movie_containers[7].find('div', class_ = 'ratings-metascore')
type(eighth_movie_mscore)


# Lists to store the scraped data in
names = []
years = []
imdb_ratings = []
metascores = []
votes = []

# Extract data from individual movie container
for container in movie_containers:

    # If the movie has Metascore, then extract.  Everything below must be
    # indented under this `if`, otherwise containers without a Metascore
    # are processed too and the lookups fail.
    if container.find('div', class_ = 'ratings-metascore') is not None:

        # The name
        name = container.h3.a.text
        names.append(name)

        # The year
        year = container.h3.find('span', class_ = 'lister-item-year').text
        years.append(year)

        # The IMDB rating
        imdb = float(container.strong.text)
        imdb_ratings.append(imdb)

        # The Metascore
        m_score = container.find('span', class_ = 'metascore').text
        metascores.append(int(m_score))

        # The number of votes
        vote = container.find('span', attrs = {'name':'nv'})['data-value']
        votes.append(int(vote))


test_df = pd.DataFrame({
    'movie': names,
    'year': years,
    'imdb': imdb_ratings,
    'metascore': metascores,
    'votes': votes})
# info() prints its summary itself and returns None, so it is not wrapped
# in print() (that would emit a stray "None" line).
test_df.info()
print(test_df)

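输出没有显示电影的名称,但其余内容显示正常:

RangeIndex: 46 entries, 0 to 45
Data columns (total 5 columns):
imdb         46 non-null float64
metascore    46 non-null int64
movie        46 non-null object
votes        46 non-null int64
year         46 non-null object
dtypes: float64(1), int64(2), object(2)
memory usage: 1.9 KB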

1 回答

  • -1
    from requests import get
    from bs4 import BeautifulSoup
    import pandas as pd
    
    # Scrape the first page of IMDb's 2017 releases (sorted by vote count)
    # and collect name, year, IMDb rating, Metascore and vote count into a
    # pandas DataFrame.
    url = 'http://www.imdb.com/search/title?release_date=2017&sort=num_votes,desc&page=1'
    response = get(url)
    # Stop with a clear HTTP error instead of parsing an error page and
    # failing later with an IndexError on an empty result list.
    response.raise_for_status()
    print(response.text[:500])

    html_soup = BeautifulSoup(response.text, 'html.parser')

    # A multi-class string passed to class_ is matched against the exact
    # value of the element's class attribute.
    movie_containers = html_soup.find_all('div', class_ = 'lister-item mode-advanced')
    print(type(movie_containers))
    print(len(movie_containers))

    # --- Inspect the first container ------------------------------------
    first_movie = movie_containers[0]

    first_name = first_movie.h3.a.text

    # The class string must be exactly 'text-muted' with no stray spaces;
    # the previous 'text- muted unbold ' never matched and returned None.
    first_year = first_movie.h3.find('span', class_ = 'lister-item-year text-muted unbold')

    print(first_movie.strong)

    first_imdb = float(first_movie.strong.text)
    print("IMDB= ", first_imdb)

    first_mscore = first_movie.find('span', class_ = 'metascore favorable')
    first_mscore = int(first_mscore.text)
    print("First MetaScore", first_mscore)

    # The vote count is read from the span's data-value attribute.
    first_votes = first_movie.find('span', attrs = {'name':'nv'})
    first_votes = int(first_votes['data-value'])
    print("First_Votes=", first_votes)

    # Probe the eighth container for a Metascore block; guard the index so
    # a short result page does not raise IndexError.
    eighth_movie_mscore = (
        movie_containers[7].find('div', class_ = 'ratings-metascore')
        if len(movie_containers) > 7
        else None
    )

    # --- Collect data from every container ------------------------------
    # Lists to store the scraped data in
    names = []
    years = []
    imdb_ratings = []
    metascores = []
    votes = []

    # Extract data from individual movie container
    for container in movie_containers:

        # If the movie has Metascore, then extract:
        if container.find('div', class_ = 'ratings-metascore') is not None:

            # The name
            name = container.h3.a.text
            names.append(name)

            # The year
            year = container.h3.find('span', class_ = 'lister-item-year').text
            years.append(year)

            # The IMDB rating
            imdb = float(container.strong.text)
            imdb_ratings.append(imdb)

            # The Metascore
            m_score = container.find('span', class_ = 'metascore').text
            metascores.append(int(m_score))

            # The number of votes
            vote = container.find('span', attrs = {'name':'nv'})['data-value']
            votes.append(int(vote))

    test_df = pd.DataFrame({
        'movie': names,
        'year': years,
        'imdb': imdb_ratings,
        'metascore': metascores,
        'votes': votes})
    # info() prints its summary itself and returns None, so it is not
    # wrapped in print() (that would emit a stray "None" line).
    test_df.info()
    print(test_df)
    

    这段代码在我这里可以正常运行。
    enter image description here

相关问题