Async driver #4

Open · wants to merge 2 commits into master
20 changes: 20 additions & 0 deletions README_ASYNC.md
@@ -0,0 +1,20 @@
# Run the asynchronous crawler
## Simple start
Install the latest version of **Caqui**

```
pip install caqui
```
Start the WebDriver as a server
```
$ ./chromedriver --port=9999

Starting ChromeDriver 94.0.4606.61 (418b78f5838ed0b1c69bb4e51ea0252171854915-refs/branch-heads/4606@{#1204}) on port 9999
Only local connections are allowed.
Please see https://chromedriver.chromium.org/security-considerations for suggestions on keeping ChromeDriver safe.
ChromeDriver was started successfully.
```
Run the crawler
```
$ python webscraping_example_async.py
```
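Before running the crawler, it can help to confirm that ChromeDriver is actually listening on the chosen port. A minimal pre-flight sketch using the standard WebDriver `/status` endpoint (the `driver_is_up` helper name is an assumption for illustration, not part of this PR; the port matches the command above):

```python
import json
from urllib.request import urlopen
from urllib.error import URLError


def driver_is_up(url="http://127.0.0.1:9999"):
    """Return True if a WebDriver server answers /status and reports ready."""
    try:
        with urlopen(url + "/status", timeout=2) as resp:
            # W3C WebDriver status payload: {"value": {"ready": true, ...}}
            return json.load(resp).get("value", {}).get("ready", False)
    except (URLError, OSError):
        # Nothing listening, or the request timed out
        return False
```

The helper returns `False` both when nothing is listening and when the server reports it is not ready, so the crawler can bail out early with a clear message.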
1 change: 1 addition & 0 deletions requirements.txt
@@ -0,0 +1 @@
caqui
84 changes: 84 additions & 0 deletions webscraping_example_async.py
@@ -0,0 +1,84 @@
# from selenium import webdriver
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.common.exceptions import TimeoutException
import asyncio
from caqui.easy import AsyncDriver
from caqui.by import By


async def async_scraping():
    # Specifying incognito mode as you launch your browser [OPTIONAL]
    # option = webdriver.ChromeOptions()
    # option.add_argument("--incognito")

    # Create new Instance of Chrome in incognito mode
    # browser = webdriver.Chrome(executable_path='/Library/Application Support/Google/chromedriver', chrome_options=option)
    remote = "http://127.0.0.1:9999"
    capabilities = {
        "desiredCapabilities": {
            "name": "webdriver",
            "browserName": "chrome",
            "acceptInsecureCerts": True,
            # "goog:chromeOptions": {"extensions": [], "args": ["--headless"]},
        }
    }
    browser = AsyncDriver(remote, capabilities)

    # Wait up to 20 seconds when locating elements
    timeout = 20
    await browser.implicitly_wait(timeout)

    # Go to desired website
    await browser.get("https://github.com/TheDancerCodes")

    # try:
    #     # Wait until the final element [Avatar link] is loaded.
    #     # Assumption: if the Avatar link is loaded, the whole page is relatively
    #     # loaded, because it is among the last things to be loaded.
    #     WebDriverWait(browser, timeout).until(
    #         EC.visibility_of_element_located(
    #             (By.XPATH, "//img[@class='avatar width-full rounded-2']")
    #         )
    #     )
    # except TimeoutException:
    #     print("Timed out waiting for page to load")
    #     browser.quit()

    # Get all of the titles for the pinned repositories.
    # find_elements returns element objects, not the plain title strings.

    # find_elements_by_xpath - returns an array of element objects.
    # titles_element = browser.find_elements_by_xpath("//a[@class='text-bold']")
    titles_element = await browser.find_elements(By.XPATH, "//a[@class='text-bold']")

    # List comprehension to get the actual repo titles and not the element objects.
    titles = [x.text for x in titles_element]

    # print response in terminal
    print("TITLES:")
    print(titles, "\n")

    # Get all of the pinned repo languages
    # language_element = browser.find_elements_by_xpath("//p[@class='mb-0 f6 text-gray']")
    language_element = await browser.find_elements(
        By.XPATH, "//p[@class='mb-0 f6 text-gray']"
    )
    languages = [
        x.text for x in language_element
    ]  # same list-comprehension pattern as above

    # print response in terminal
    print("LANGUAGES:")
    print(languages, "\n")

    # Pair each title with its corresponding language using the zip function and print each pair
    for title, language in zip(titles, languages):
        print("RepoName : Language")
        print(title + ": " + language, "\n")


if __name__ == "__main__":
    asyncio.run(async_scraping())
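The final pairing step is plain Python and can be exercised without a browser or driver. A small sketch with made-up stand-in data (the repo names here are illustrative, not real scrape results):

```python
# Stand-in values for what find_elements + .text would produce
titles = ["caqui", "webscraping-sample"]
languages = ["Python", "Python"]

# zip pairs each title with the language at the same index;
# it stops at the shorter list if the two differ in length.
pairs = [f"{title}: {language}" for title, language in zip(titles, languages)]

for pair in pairs:
    print("RepoName : Language")
    print(pair, "\n")
```

Because `zip` truncates to the shorter input, a page where one XPath matches fewer nodes than the other silently drops the unmatched entries, which is worth keeping in mind when debugging the crawler's output.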