python - What can I do to scrape 10,000 pages without triggering captchas?
Hi there. I've been trying to collect information from 10,000 pages of a website for a school project. I thought everything was fine until I got an error on page 4. I checked the page manually and found that it asks me for a captcha.
What can I do to avoid it? Maybe set a timer between searches?
Here is my code.
"""Scrape company details from the usbizs.com New York listing pages.

The script walks every listing page, follows each company link found on
it, reads the first two ``div.infolist`` blocks of the company page and
stores them as one CSV row.  Progress is written to disk after every
listing page so that an interrupted run keeps what was already collected.

Requests are throttled with a fixed pause.  Hammering the site with
back-to-back requests is the usual reason a scraper gets redirected to a
captcha page; slowing down lowers that risk but cannot guarantee the site
will never challenge the client.
"""
import csv
import time

import bs4
import requests

FIRST_PAGE_URL = "http://www.usbizs.com/ny/new_york.html"
PAGE_URL_TEMPLATE = "http://www.usbizs.com/ny/new_york-{}.html"
# NOTE(review): ".cv" looks like a typo for ".csv"; kept unchanged so that
# anything already reading this file keeps working.
OUTPUT_FILE = "list_information.cv"
# Pause before every HTTP request; raise it if captchas still show up.
REQUEST_DELAY_SECONDS = 2.0
# Without a timeout a stalled connection would block the run forever.
REQUEST_TIMEOUT_SECONDS = 30


def fetch_soup(session, url):
    """Wait for the throttle delay, download *url* and return it parsed.

    Args:
        session: ``requests.Session`` reused for every request so cookies
            and the underlying connection persist between calls.
        url: Address of the page to download.

    Returns:
        A ``bs4.BeautifulSoup`` tree built with the ``lxml`` parser.

    Raises:
        requests.RequestException: On connection errors or timeouts.
    """
    time.sleep(REQUEST_DELAY_SECONDS)
    response = session.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    return bs4.BeautifulSoup(response.text, "lxml")


def save_rows(rows):
    """Overwrite ``OUTPUT_FILE`` with *rows*, one CSV record per row."""
    # newline="" is what the csv module expects; without it every record
    # is followed by an empty line on Windows.
    with open(OUTPUT_FILE, "w", newline="", encoding="utf-8") as file:
        csv.writer(file).writerows(rows)


session = requests.Session()

m_page = fetch_soup(session, FIRST_PAGE_URL)
get_pnum = m_page.select('div[class="pagenav"]')
# The page count is read from a fixed character window of the navigation
# text.  NOTE(review): this depends on the exact wording of that text --
# confirm against the live page.
max_page = int(get_pnum[0].text[9:16])
print("recolectando información de la página 1 de {}.".format(max_page))

contador = 0
information_list = []
stopped_early = False

# Pages are numbered 1..max_page inclusive.  Page 1 is already loaded;
# every later page is fetched at the top of its own iteration so that the
# last page is scraped too, not merely downloaded.
for k in range(1, max_page + 1):
    if k > 1:
        m_page = fetch_soup(session, PAGE_URL_TEMPLATE.format(k))
        print("recolectando información de la página {} de {}.".format(k, max_page))

    c_items = m_page.select('div[itemtype="http://schema.org/corporation"] a')
    # Anchors without an href cannot be followed, so they are dropped.
    c_links = [link.get("href") for link in c_items if link.get("href")]

    collected_on_page = 0
    for company_url in c_links:
        i_page = fetch_soup(session, company_url)
        print("ingresando a: {}".format(company_url))
        info_t = i_page.select('div[class="infolist"]')
        if len(info_t) < 2:
            # A captcha or error page has no data blocks.  Further requests
            # would most likely hit the same wall, so stop instead of
            # crashing with IndexError and losing this page's rows.
            print("posible captcha o página sin datos en: {}. se guarda el progreso y se detiene la recolección.".format(company_url))
            stopped_early = True
            break
        information_list.append([info_t[0].text, info_t[1].text])
        contador += 1
        collected_on_page += 1

    save_rows(information_list)
    print("información de {} clientes recolectada y guardada correctamente.".format(collected_on_page))

    if stopped_early:
        break

print("programa finalizado. información recolectada de {} clientes.".format(contador))
Comments
Post a Comment