#!/usr/bin/env python # coding: utf-8 # In[1]: from selenium import webdriver from selenium.common.exceptions import NoSuchElementException # In[2]: driver = webdriver.Firefox() # In[3]: import time class DuckDuckGoResults(object): def __init__(self,driver): self.driver = driver def search(self,searchTerm): self.driver.get("https://duckduckgo.com/") inputSearchElm = driver.find_element_by_css_selector('#search_form_input_homepage') inputSearchElm.send_keys("%s\n" % searchTerm) def scroll_botton(self): self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight);") return True def load_all_results(self): self.resultElms = self.driver.find_elements_by_css_selector("#links>div.results_links_deep") while True : self.scroll_botton() time.sleep(4) newResultElms = self.driver.find_elements_by_css_selector("#links>div.results_links_deep") if len(newResultElms) == len(self.resultElms): self.resultElms = newResultElms break self.resultElms = newResultElms def parse_resultElm(self,resultElm): try : resultAElm = resultElm.find_element_by_css_selector("a.result__a") result = { "title" : resultAElm.text, "href" : resultAElm.get_attribute("href"), "snippet" : resultElm.find_element_by_css_selector("div.result__snippet").text } return result except NoSuchElementException: print "exception",resultElm.text def get_results_python(self): return map(self.parse_resultElm,self.resultElms) def get_results_javascript(self): jsFunction = """ var resultElms = Array.prototype.slice.call(document.querySelectorAll("#links>div.results_links_deep")) return resultElms.map(function(resultElm) { var result = []; var resultAElm = resultElm.querySelector("a.result__a"); result.push(["title",resultAElm.textContent]); result.push(["href",resultAElm.getAttribute("href")]); result.push(["snippet",resultElm.querySelector("div.result__snippet").textContent]); return result; }); """ results = self.driver.execute_script(jsFunction) return map(dict,results) # In[4]: page = DuckDuckGoResults(driver) # In[5]: page.search("python") # In[6]: page.load_all_results() # In[7]: jsResults = page.get_results_javascript() # In[8]: pyResults = page.get_results_python() # In[9]: len(pyResults) == len(jsResults) # In[ ]: