
python 3.x - getting table values from nowgoal raises an IndexError

I am quite new to scraping. I am getting links from nowgoal; below is how I started navigating to the page above. I do not wish to get links for all matches; instead I have an input txt file (attached here) that specifies the selected leagues and the date. The following code reads that input:

#Initialisation
from configparser import RawConfigParser

league_index = []
final_list = []
j = 0

#Config load
config = RawConfigParser()
configFilePath = r'.config.txt'
config.read(configFilePath)
date = config.get('database_config','date')                      #provided by the user in YYYY-MM-DD format
leagues = config.get('database_config','leagues')                #provided by the user as a comma-separated list of league names
headless_param = config.get('database_config','headless')        #set to True to run Chrome without a visible browser window
leagues_list = leagues.split(',')
print(leagues_list)
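
The input file itself is not reproduced here, but given the section and keys read above, config.txt would look roughly like this (the section name and keys come from the code; the values are only illustrative):

[database_config]
date = 2021-01-28
leagues = English Premier League,Italian Serie A,Spanish La Liga
headless = True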

After initializing with the preferred date and leagues, I set up the Chrome driver as follows:

from selenium import webdriver              #selenium webdriver
import time

options = webdriver.ChromeOptions()         #initialise webdriver options
#options.binary_location = brave_path       #enable this if you are running the script on Brave
if headless_param == 'True':
    print('headless')
    options.headless = True                 #if the headless parameter is set to True, the Chrome browser will not appear in the foreground
options.add_argument('start-maximized')     #start Chrome maximised
options.add_argument('disable-infobars')    #disable infobars
options.add_experimental_option("excludeSwitches", ["enable-automation"])
#note: "prefs" must be passed as a single dict - a second call would overwrite the first one
options.add_experimental_option("prefs", {"profile.default_content_setting_values.cookies": 2,
                                          "profile.block_third_party_cookies": True})
options.add_experimental_option('useAutomationExtension', False)
options.add_argument("--incognito")         #incognito mode


#initiate the driver
driver = webdriver.Chrome(resource_path('./drivers/chromedriver.exe'), options=options)

#format the url with the requested date
url = 'http://www.nowgoal3.com/football/fixture/?f=ft0&date=' + date

#get the url
driver.get(url)
#wait for some time
time.sleep(3)

#switch to the league view
driver.find_element_by_xpath('//*[@id="li_league"]').click()
time.sleep(5)
#click on the team-ranking checkbox
driver.find_element_by_xpath('//*[@id="TeamOrderCheck"]').click()

After this, you will be brought to the fixtures page grouped by league, with the team rankings shown (I also attached a snapshot of it).

I try to get the data from the table by looping through its rows; the code is as follows:

import bs4

#Get the league names from the page
htmlSource = driver.page_source
#Pass the html source into soup
soup = bs4.BeautifulSoup(htmlSource, 'html.parser')
#Table
table = soup.select('table[id="table_live"]')
#Rows of the table
all_rows = table[0].select('tr')
#Loop through each row
for i, row in enumerate(all_rows[2:]):
    try:
        key_word = row['class'][0]
        print(key_word)
        if 'Leaguestitle' in key_word:                                                       #the league has changed
            league = row.a.text
            print(row.a.text)
            if row.a.text in leagues_list:
                j = 1
            else:
                j = 0
        elif j == 1:
            home_team = row.findAll('a')[0].text                                             #home team
            print(home_team)
            away_team = row.findAll('a')[1].text                                             #away team
            match_number = ''.join(filter(str.isdigit, row.findAll('a')[2]['href'].strip())) #match number
            link = 'http://data.nowgoal.group/3in1odds/' + match_number + '.html'            #link for 3 in 1 odds from the match code
            home_ranking = row.findAll('span')[0].text.strip('[]')                           #home team ranking
            away_ranking = row.findAll('span')[1].text.strip('[]')                           #away team ranking
            final_list.append([home_team, home_ranking, away_team, away_ranking, league, match_number, link])
    except KeyError:
        try:
            if row['style'] == 'display:none':
                continue
            elif j == 1:
                home_team = row.findAll('a')[0].text                                             #home team
                away_team = row.findAll('a')[1].text                                             #away team
                home_ranking = row.findAll('span')[0].text.strip('[]')                           #home team ranking
                away_ranking = row.findAll('span')[1].text.strip('[]')                           #away team ranking
                match_number = ''.join(filter(str.isdigit, row.findAll('a')[2]['href'].strip())) #match code associated with each match
                link = 'http://data.nowgoal.group/3in1odds/' + match_number + '.html'            #link for 3 in 1 odds from the match code
                final_list.append([home_team, home_ranking, away_team, away_ranking, league, match_number, link])
        except KeyError:
            print('KeyError')

    except IndexError:
        if j == 1:
            home_team = row.findAll('a')[0].text                                             #home team
            away_team = row.findAll('a')[1].text                                             #away team
            home_ranking = row.findAll('span')[0].text.strip('[]')                           #home team ranking
            away_ranking = row.findAll('span')[1].text.strip('[]')                           #away team ranking
            match_number = ''.join(filter(str.isdigit, row.findAll('a')[2]['href'].strip())) #match code associated with each match
            link = 'http://data.nowgoal.group/3in1odds/' + match_number + '.html'            #link for 3 in 1 odds from the match code
            final_list.append([home_team, home_ranking, away_team, away_ranking, league, match_number, link])
            print('IndexError-captured')

print(final_list)    #show the final result
driver.quit()        #close the browser

I then print out the home team and get the following output:

Chelsea adtext-bg QC: MAY88.COM - NHà CáI H?P PHáP NA UY - TH??NG N?P 100% - HOàN TR? 100TR - H? TR? 24/7

Then it threw me an IndexError as follows:

Traceback (most recent call last):
  File "D:/Football matters/Sttratagem data access/Games By Numbers/Nowgoal scraping project/codes/NOWGOAL-20200721T024808Z-001/NOWGOAL/PYFILES/Link_extractor_v1.3.py", line 124, in <module>
    away_team = row.findAll('a')[1].text                                                #away team
IndexError: list index out of range

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "D:/Football matters/Sttratagem data access/Games By Numbers/Nowgoal scraping project/codes/NOWGOAL-20200721T024808Z-001/NOWGOAL/PYFILES/Link_extractor_v1.3.py", line 149, in <module>
    away_team = row.findAll('a')[1].text                                            #away team
IndexError: list index out of range
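
From the class name printed just before the crash (adtext-bg), I suspect the row that breaks is an advert row that does not contain the two team links, so row.findAll('a')[1] goes out of range. A minimal guard, assuming those advert rows are the cause, would be to check each row before indexing into it, for example with a small helper like this (the helper name is mine, not part of the original script):

def is_match_row(row):
    """Return True only if the row has the anchors and spans the scraper indexes into."""
    classes = row.get('class', [])
    anchors = row.findAll('a')
    spans = row.findAll('span')
    if any('adtext' in c for c in classes):          #advert rows such as the MAY88.COM one above
        return False
    return len(anchors) >= 3 and len(spans) >= 2     #home link, away link, match link and two ranking spans

#inside the loop, before reading the cells:
#    if not is_match_row(row):
#        continue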

I am seeking your kind advice on this. I would greatly appreciate your help. Thanks, Zep.

question from:https://stackoverflow.com/questions/65929976/getting-table-value-from-nowgoal-has-got-an-index-error


1 Reply

#leagues you want to extract
league_list = ["English Premier League", 'Italian Serie A',
               'England Championship', 'Spanish La Liga', 'Swedish Allsvenskan', 'USA Major League Soccer', 'Saudi', 'Dutch Cup']

#explicit-wait setup (replaces the time.sleep calls)
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

wait = WebDriverWait(driver, 10)    #timeout in seconds - adjust as needed

#wait for the league view and click it
wait.until(EC.element_to_be_clickable((By.ID, "li_league"))).click()
#click on the team-ranking checkbox
wait.until(EC.element_to_be_clickable(
    (By.XPATH, "//label[@for='TeamOrderCheck']/span"))).click()

for league in league_list:
    try:
        #locate the header row of this league
        nextRow = wait.until(EC.presence_of_element_located(
            (By.XPATH, '//tr[.//a[contains(text(),"{}")]]'.format(league))))
        row_id = nextRow.get_attribute("id").split("_")[1]
        try:
            #all visible rows between this league's header and the next header row (id incremented by 1)
            rows = wait.until(EC.presence_of_all_elements_located(
                (By.XPATH, '//tr[preceding-sibling::tr[.//a[contains(text(),"{}")]] and following-sibling::tr[@id="tr_{}"] and not(@style="display:none")]'.format(league, int(row_id) + 1))))
            print("########The result for {} ########".format(league))
            for i in rows:
                print(i.get_attribute("textContent"))
            print("###########Completed##############")
        except TimeoutException:
            #last league on the page - there is no following header row
            rows = wait.until(EC.presence_of_all_elements_located(
                (By.XPATH, '//tr[preceding-sibling::tr[.//a[contains(text(),"{}")]] and not(@style="display:none")]'.format(league))))
            print("########The result for {} ########".format(league))
            for i in rows:
                print(i.get_attribute("textContent"))
            print("###########Completed##############")
            continue
    except TimeoutException:
        #league not present on this date - skip it
        continue

You can use the preceding-sibling and following-sibling XPath axes. As there is no unique way to identify where a league's block of rows ends, we take the id of the league's header row and increment it by 1 to reference the next header row.
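
As a concrete illustration, if the "English Premier League" header row happened to have id="tr_5" (the id value here is purely illustrative, and this relies on the assumption above that header rows carry sequential tr_N ids), the locator built inside the loop expands like this:

league = "English Premier League"
row_id = "5"                                   #taken from nextRow.get_attribute("id").split("_")[1]

#every visible row after this league's header and before the next header row (tr_6)
xpath = ('//tr[preceding-sibling::tr[.//a[contains(text(),"{}")]]'
         ' and following-sibling::tr[@id="tr_{}"]'
         ' and not(@style="display:none")]').format(league, int(row_id) + 1)
print(xpath)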


