Selenium - Bypassing Cloudflare JS challenges with PyAutoGUI for Low Volume Applications.
Cloudflare JS challenges are a common way to prevent bot activity. There are multiple guides and libraries, such as cloudflare-scrape and selenium-stealth, but currently Cloudflare’s JS detection seems to be robust against these libraries by detecting whether a Selenium flag is set on the browser end.
The way I’ve been bypassing these JS challenges is by using PyAutoGUI.
Consider a vanilla implementation of the Chrome class:
import time
from typing import Optional, Tuple, Union
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.chrome.webdriver import WebDriver
from webdriver_manager.chrome import ChromeDriverManager
class Chrome:
    """Small wrapper around a Selenium Chrome webdriver for visiting pages.

    Attributes:
        _driver: The Chrome webdriver instance.
        _options: The Chrome options the driver was created with.
        _page_load_timeout: Page-load timeout, in seconds, set on the driver.
        _wait_per_page: Seconds to sleep before the page source is read.
        _wait_before_execute: Seconds to sleep before the first script runs.
        _headless: The headless flag passed to the constructor.
    """

    def __init__(
        self,
        headless: bool = False,
        wait_before_execute: int = 5,
        wait_per_page: int = 10,
        page_load_timeout: int = 10,
    ):
        """Store the timing settings and start the Chrome webdriver.

        Args:
            headless: Whether Chrome should run headless.
            wait_before_execute: Seconds to wait before executing any script.
            wait_per_page: Seconds to wait after each page load.
            page_load_timeout: Seconds after which a page load times out.
        """
        self._headless: bool = headless
        self._wait_before_execute: int = wait_before_execute
        self._wait_per_page: int = wait_per_page
        # Must be set before create_driver(), which reads it.
        self._page_load_timeout: int = page_load_timeout

        self._driver: WebDriver
        self._options: ChromeOptions
        created = self.create_driver(headless=headless)
        self._driver, self._options = created

    def create_driver(
        self,
        headless: bool = False,
    ) -> Tuple[webdriver.Chrome, ChromeOptions]:
        """Create a maximized Chrome webdriver.

        Args:
            headless: Headless flag stored on the Chrome options.

        Returns:
            A tuple of (driver, driver options).
        """
        chrome_options: ChromeOptions = ChromeOptions()
        chrome_options.headless = headless
        # Drop the "enable-automation" switch and the automation extension.
        experimental_settings = (
            ("excludeSwitches", ["enable-automation"]),
            ("useAutomationExtension", False),
        )
        for setting_name, setting_value in experimental_settings:
            chrome_options.add_experimental_option(setting_name, setting_value)

        driver_binary = ChromeDriverManager().install()
        browser: webdriver.Chrome = webdriver.Chrome(
            executable_path=driver_binary,
            options=chrome_options,
        )
        browser.switch_to.window(browser.current_window_handle)
        browser.maximize_window()
        browser.set_page_load_timeout(self._page_load_timeout)
        return browser, chrome_options

    def get(self, url: str, wait: int = 0) -> str:
        """Navigate to a url, focus the window twice, and return the page html.

        Args:
            url: URL for the driver to go to.
            wait: Seconds to wait before reading the page source; 0 means use
                the instance-wide ``_wait_per_page``.

        Returns:
            Current page's html source.
        """
        try:
            self._driver.get(url)
        except TimeoutException:
            # A page-load timeout is ignored; the method carries on and reads
            # whatever page source the driver reports.
            pass

        if wait != 0:
            settle_seconds: int = wait
        else:
            settle_seconds = self._wait_per_page

        # Sleep, then focus the window; done twice with different delays.
        for pause_seconds in (self._wait_before_execute, 5):
            time.sleep(pause_seconds)
            self._driver.execute_script("window.focus();")

        time.sleep(settle_seconds)
        return self._driver.page_source

    def get_url(self) -> str:
        """Return the url the driver is currently on.

        NOTE(review): the original docstring flags this function as making the
        code thread unsafe; the reason is not visible in this listing.

        Returns:
            Current url.
        """
        return self._driver.current_url

    @property
    def driver(self) -> Union[webdriver.Firefox, webdriver.Chrome]:
        """The underlying webdriver instance."""
        return self._driver
You can use it by calling Chrome:
# Start Chrome through the wrapper with its default settings, then load a
# page; get() returns the page's html source (discarded here).
driver = Chrome()
driver.get("https://www.google.com")
If we were to use this on a webpage with a JS challenge enabled, you would get stuck on the challenge page and never proceed to the site. The issue is that most libraries I tried change the browser settings to make it look like a regular user, which never works because there is a flag that you cannot turn off. You could, however, use a previously existing user data directory, but that solution is only temporary: your cookie will expire, and you will be prompted to solve the challenge again.
The easier way I discovered without using any libraries except selenium is:
1. Open a Chrome window from the command line: `google-chrome`.
2. Enable the remote debugging port on Chrome by setting a flag: `--remote-debugging-port=<port>`. Setting this does not trigger the automation flag, but if you were to connect to this port, the flag would be enabled.
3. Set the homepage of Chrome to a JS-challenged page: `--homepage <url>`.
4. Set an existing user data dir with `--user-data-dir=<dir>`; otherwise there are prompts because you appear to be a new user. This can be any directory on your disk.
5. Use PyAutoGUI to control your mouse movement, for example along a Bezier curve, while you are on the JS-challenged homepage.
6. After you have passed the challenge, you can re-connect to Chrome using Selenium, until your cookie expires and you need to redo steps 1~5 again.
The command would look like this: `google-chrome --remote-debugging-port=8888 --user-data-dir=/temp --start-maximized --homepage https://maxlei.dev`
Since Selenium hasn’t connected to Chrome yet, the automation flag is not enabled, and since you are generating random mouse movement using PyAutoGUI, Cloudflare does not detect this.
The major downside is that you need an active window and Chrome cannot be headless, but this method is pretty useful if you are only scraping the page once every couple of minutes.
Consider an upgraded version of the scraper:
"""Navigates websites."""
import logging
import math
import os
import random
import shlex
import signal
import socket
import subprocess
import time
from typing import Callable, List, Optional, Tuple, Union

import numpy as np
import numpy.typing as npt
import pyautogui
from scipy import interpolate
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.chrome.webdriver import WebDriver
from webdriver_manager.chrome import ChromeDriverManager
def search_free_port() -> int:
    """Find an open port.

    Binds a TCP socket to port 0 so the operating system picks a free port,
    reads the chosen port number, and closes the socket again.

    Note that the port is only known to be free at the moment of the call;
    another process may claim it before the caller uses it.

    Returns:
        An available open port on the system.
    """
    # The context manager closes the socket even if bind/listen raises.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.bind(("", 0))
        sock.listen(1)
        free_port: int = sock.getsockname()[1]
    return free_port
def mouse_bezier_curve(repeats: int = 10, move: bool = True) -> None:
    """Move the mouse along randomly generated, slightly jittered spline curves.

    For every repeat a random start and end point on the screen are chosen,
    control points are spread evenly between them and jittered by at most one
    pixel, a B-spline is fitted through them, and the mouse is moved (and
    clicked) at points sampled along that spline.

    Installing tkinter:
        https://askubuntu.com/questions/1224230/how-to-install-tkinter-for-python-3-8

    Args:
        repeats: Number of curves to generate.
        move: When true, move the mouse to each sampled point and click there;
            when false the curve is still computed but no mouse events are sent.

    Based on:
        https://stackoverflow.com/questions/44467329/pyautogui-mouse-movement-with-bezier-curve

    NOTE(review): the indentation of the original listing was lost; the click
    and the sleep are assumed to happen once per sampled point, inside the
    ``if move`` branch.
    """
    # The screen size is the same for every curve, so query it only once.
    x_window_size: int
    y_window_size: int
    x_window_size, y_window_size = pyautogui.size()

    for _ in range(repeats):
        # Number of control points. Must be at least 2.
        control_points: int = random.randint(10, 15)

        # Random start (x1, y1) and destination (x2, y2), kept away from the screen edges.
        point_x1: int = random.randint(100, x_window_size - 100)
        point_y1: int = random.randint(200, y_window_size - 100)
        point_x2: int = random.randint(100, x_window_size - 100)
        point_y2: int = random.randint(200, y_window_size - 100)

        # Distribute control points between start and destination evenly.
        point_x: npt.NDArray[np.int64] = np.linspace(point_x1, point_x2, num=control_points, dtype="int")
        point_y: npt.NDArray[np.int64] = np.linspace(point_y1, point_y2, num=control_points, dtype="int")

        # Randomise inner points a bit (+-random_inner_points at most).
        random_inner_points: int = 1
        random_x: List[int] = [
            random.randint(-random_inner_points, random_inner_points) for _ in range(control_points)
        ]
        random_y: List[int] = [
            random.randint(-random_inner_points, random_inner_points) for _ in range(control_points)
        ]
        # Start and destination themselves are left untouched.
        random_x[0] = random_y[0] = random_x[-1] = random_y[-1] = 0
        point_x += random_x
        point_y += random_y

        # Approximate using a B-spline. Degree 3 is recommended and the degree
        # must be less than the number of control points.
        degree: int = min(3, control_points - 1)
        # splprep returns the spline representation and the parameter values;
        # only the representation is needed here.
        tck, _ = interpolate.splprep([point_x, point_y], k=degree)  # noqa

        # Sample roughly one point per 25 pixels of straight-line distance.
        distance: float = math.hypot(point_x2 - point_x1, point_y2 - point_y1)
        sample_positions: npt.NDArray[np.float64] = np.linspace(0, 1, num=2 + int(distance / 25.0))
        points = interpolate.splev(sample_positions, tck)

        # Move mouse; the random total duration is split evenly over the points.
        duration: float = random.uniform(0.1, 0.3)
        timeout: float = duration / len(points[0])
        point_list: List[Tuple[int, int]] = list(zip(*(i.astype(int) for i in points)))
        if move:
            for point in point_list:
                pyautogui.moveTo(*point)
                pyautogui.click()
                time.sleep(timeout)
def open_command_line(
    command: str,
    post_command_func: Optional[Callable[[], None]] = None,
    check_grep: Optional[str] = None,
    check_grep_delay: int = 5,
) -> int:
    """Open a command line tool.

    The command is started through the shell in its own session (and therefore
    its own process group), so that the whole group can later be terminated
    with os.killpg.

    Args:
        command: The shell command to run. It is passed to the shell as-is, so
            the caller is responsible for quoting.
        post_command_func: A callback to run immediately after the subprocess
            has been started.
        check_grep: Optional shell command used to verify that the command is
            running; its stdout must contain at least two lines.
        check_grep_delay: Time in seconds to sleep before running check_grep.

    Returns:
        A pid of the process, killable through os.killpg.

    Raises:
        AssertionError: If check_grep printed fewer than two lines.
        subprocess.CalledProcessError: If check_grep exits with a non-zero status.
    """
    # stdout is discarded rather than piped: the Popen object is not returned,
    # so nobody could ever read from the pipe.
    process: subprocess.Popen = subprocess.Popen(  # type: ignore # noqa
        command,
        stdout=subprocess.DEVNULL,
        shell=True,
        # Equivalent to preexec_fn=os.setsid, without running Python code in
        # the forked child.
        start_new_session=True,
    )
    if post_command_func:
        post_command_func()
    if check_grep:
        time.sleep(check_grep_delay)
        output: subprocess.CompletedProcess = subprocess.run(  # type: ignore
            check_grep,
            shell=True,
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        # Output ending in a newline splits into one element per line plus a
        # trailing empty string, so "> 2" means at least two lines.
        # Raised explicitly (instead of `assert`) so the check survives
        # `python -O`; the exception type is kept for existing callers.
        if len(output.stdout.decode().split("\n")) <= 2:
            raise AssertionError(f"Command: {command}, was not opened correctly.")
        logging.info(
            "open_command_line: %s, stdout: %r, stderr: %r",
            command,
            output.stdout,
            output.stderr,
        )
    return process.pid
class Chrome:
    """Chrome driver helps us to navigate website.

    Attributes:
        _options: Options we use for chrome.
        _driver: The driver object for chrome.
        _page_load_timeout: Timeout we use if the page does not load.
        _wait_per_page: Time in seconds to wait before we get page source.
        _wait_before_execute: Time in seconds to wait before we execute any scripts.
        _reset_captcha: Reset chrome if we see captcha.
        _debugger: Connect to an existing chrome instead of creating a new one from web driver.
        _headless: If we are not using a debugger, we can set headless to create a headless chrome.
        _user_data_dir: Directory of an existing chrome profile.
        _debugger_home_page: Home page of the debugger.
        _chrome_command: Command to run chrome.
        _debugger_pid: Process id of the debugger.
        _free_port: Free port we can use for the debugger.
    """

    # Maximum number of browser resets a single get() call may perform.
    _MAX_CAPTCHA_RESETS: int = 5
    # Lower-case phrases whose presence in the page source marks a blocked page.
    _BLOCKED_PAGE_MARKERS: Tuple[str, ...] = (
        "needs to review the security",
        "your request has been blocked",
    )

    def __init__(
        self,
        headless: bool = False,
        wait_before_execute: int = 5,
        wait_per_page: int = 10,
        page_load_timeout: int = 10,
        user_data_dir: Optional[str] = None,
        debugger: bool = False,
        debugger_home_page: Optional[str] = None,
        reset_captcha: bool = False,
    ):
        """Initialize chrome webdriver.

        Args:
            headless: Whether or not to headless.
            wait_before_execute: The number of seconds to wait before we execute any script.
            wait_per_page: The number of seconds to wait per page load.
            page_load_timeout: The number of seconds to raise a timeout exception.
            user_data_dir: Chrome's user data location, usually in ~/.config/google-chrome.
            debugger: If enabled then create a chrome session using chrome command line.
            debugger_home_page: The home page of the debugger when it opens, useful for circumventing blocks because
                the chrome instance is not controlled by webdriver initially.
            reset_captcha: Reset if page encounters captcha.
        """
        self._page_load_timeout: int = page_load_timeout
        self._wait_per_page: int = wait_per_page
        self._wait_before_execute: int = wait_before_execute
        self._reset_captcha: bool = reset_captcha
        self._debugger: bool = debugger
        self._headless: bool = headless
        self._user_data_dir: Optional[str] = user_data_dir
        self._debugger_home_page: Optional[str] = debugger_home_page
        # Both are filled in by create_driver() when debugger mode is used.
        self._chrome_command: Optional[str] = None
        self._free_port: Optional[int] = None
        self._driver: WebDriver
        self._options: ChromeOptions
        self._debugger_pid: Optional[int]
        self._driver, self._options, self._debugger_pid = self.create_driver(
            headless=headless,
            user_data_dir=user_data_dir,
            debugger=debugger,
            debugger_home_page=debugger_home_page,
        )

    def create_driver(
        self,
        headless: bool = False,
        user_data_dir: Optional[str] = None,
        debugger: bool = False,
        debugger_home_page: Optional[str] = None,
    ) -> Tuple[webdriver.Chrome, ChromeOptions, Optional[int]]:
        """Initialize chrome webdriver.

        In debugger mode Chrome is first started from the command line with a
        remote debugging port, and the webdriver then attaches to that port.
        Otherwise the webdriver starts Chrome itself.

        Args:
            headless: Whether or not to headless.
            user_data_dir: Chrome's user data location, usually in ~/.config/google-chrome.
            debugger: If enabled then create a chrome session using chrome command line.
            debugger_home_page: The home page of the debugger when it opens, useful for circumventing blocks because
                the chrome instance is not controlled by webdriver initially.

        Returns:
            A tuple of driver, driver options, and the pid of the command-line chrome (None outside debugger mode).

        Raises:
            AssertionError: If debugger mode is requested without a user data dir.
        """
        options: ChromeOptions = ChromeOptions()
        options.headless = headless

        if user_data_dir is not None:
            options.add_argument(f"user-data-dir={user_data_dir}")
            if not os.path.isdir(user_data_dir):
                logging.warning(
                    f"Chrome's user_data_dir: {user_data_dir} is not a valid location, download might fail."
                )

        debugger_pid: Optional[int] = None
        if debugger:
            # Raised explicitly (instead of `assert`) so the check survives
            # `python -O`; the exception type is kept for existing callers.
            if user_data_dir is None:
                raise AssertionError("To use the debugger mode must require a user data dir.")
            self._free_port = search_free_port()
            # https://peter.sh/experiments/chromium-command-line-switches/
            # The command is run through a shell, so values are shell-quoted.
            chrome_command: str = (
                f"google-chrome "
                f"--remote-debugging-port={self._free_port} "
                f"--user-data-dir={shlex.quote(user_data_dir)} "
                f"--start-maximized "
            )
            if debugger_home_page:
                chrome_command += f"--homepage {shlex.quote(debugger_home_page)} "
            self._chrome_command = chrome_command
            debugger_pid = open_command_line(
                command=self._chrome_command,
                check_grep=f"ps -ef | grep chrome | grep {self._free_port}",
            )
            options.add_experimental_option("debuggerAddress", f"127.0.0.1:{self._free_port}")
            # Wait before the webdriver attaches to the freshly started chrome.
            time.sleep(10)
        else:
            options.add_experimental_option("excludeSwitches", ["enable-automation"])
            options.add_experimental_option("useAutomationExtension", False)

        # NOTE(review): `executable_path` and `options.headless` are deprecated
        # in newer Selenium 4 releases - confirm against the pinned version.
        driver: webdriver.Chrome = webdriver.Chrome(
            executable_path=ChromeDriverManager().install(),
            options=options,
        )
        driver.switch_to.window(driver.current_window_handle)
        driver.maximize_window()
        driver.set_page_load_timeout(self._page_load_timeout)
        return driver, options, debugger_pid

    def get(self, url: str, wait: int = 0) -> str:
        """Get the url, and focus.

        When the loaded page looks blocked and captcha resets are enabled in
        debugger mode, the browser is reset and the page is loaded again, at
        most ``_MAX_CAPTCHA_RESETS`` times.

        Args:
            url: URL for the driver to go to.
            wait: Override the wait variable; 0 means use ``_wait_per_page``.

        Returns:
            Current page's html source.
        """
        wait_per_page: int = wait if wait != 0 else self._wait_per_page
        resets_done = 0
        while True:
            page_source: str = self._load_page(url, wait_per_page)
            if not self._needs_reset(page_source):
                return page_source
            if resets_done >= self._MAX_CAPTCHA_RESETS:
                # Still blocked after the last allowed reset; give up.
                logging.error("Captcha resets too many times, probably will not work for the future runs.")
                return page_source
            resets_done += 1
            self._reset_browser()

    def _load_page(self, url: str, wait_per_page: int) -> str:
        """Load url, focus the window twice, wait, and return the page source."""
        try:
            # Can happen to some pages like dmhy url, we just let it pass, because we want to move the page
            self._driver.get(url)
        except TimeoutException:
            logging.debug("Page load timed out for %s, continuing anyway.", url)
        # Need to sleep to make sure everything works
        time.sleep(self._wait_before_execute)
        # We need to focus the window, otherwise cloudflare bot checks will not work
        self._driver.execute_script("window.focus();")
        time.sleep(5)
        self._driver.execute_script("window.focus();")
        time.sleep(wait_per_page)
        # The page_source is the html file at the current page at url
        return self._driver.page_source

    def _needs_reset(self, page_source: str) -> bool:
        """Return True if the page looks blocked and a browser reset is enabled and possible."""
        if not (self._reset_captcha and self._debugger and self._chrome_command):
            return False
        lowered = page_source.lower()
        return any(marker in lowered for marker in self._BLOCKED_PAGE_MARKERS)

    def _reset_browser(self) -> None:
        """Re-run the chrome command with mouse movement, then recreate the driver."""
        # Don't need to save the pid since we will destroy the driver anyways
        open_command_line(
            command=self._chrome_command,  # type: ignore[arg-type]
            post_command_func=mouse_bezier_curve,
            check_grep=f"ps -ef | grep chrome | grep {self._free_port}",
        )
        time.sleep(60)
        self.cleanup()
        self._driver, self._options, self._debugger_pid = self.create_driver(
            headless=self._headless,
            user_data_dir=self._user_data_dir,
            debugger=self._debugger,
            debugger_home_page=self._debugger_home_page,
        )
        time.sleep(10)

    def get_url(self) -> str:
        """Get the current url.

        NOTE(review): the original docstring flags this function as making the
        code thread unsafe; the reason is not visible in this listing.

        Returns:
            Current url.
        """
        current_url: str = self._driver.current_url
        return current_url

    @property
    def driver(self) -> Union[webdriver.Firefox, webdriver.Chrome]:
        """The underlying webdriver instance."""
        return self._driver

    def cleanup(self) -> None:
        """Cleanup chrome driver.

        Quits the webdriver and, in debugger mode, sends SIGTERM to the process
        group of the command-line chrome. The process group is signalled even
        if quitting the driver raises, and an already-exited chrome is ignored.
        """
        try:
            self._driver.quit()
        finally:
            if self._debugger and isinstance(self._debugger_pid, int):
                # Forget the pid first so a repeated cleanup() does not signal it again.
                debugger_pid, self._debugger_pid = self._debugger_pid, None
                try:
                    os.killpg(os.getpgid(debugger_pid), signal.SIGTERM)
                except ProcessLookupError:
                    logging.debug("Chrome debugger process %s has already exited.", debugger_pid)
A couple of helper functions are needed, such as mouse_bezier_curve and open_command_line, but most of the methods stay the same except get, where we try to detect the words “needs to review the security” so that we can kill the current browser using os.killpg.
When you use this class to visit a site with Cloudflare JS challenges enabled you will see:
The first time, you will be greeted with the prompts that come with a fresh installation of Chrome, but after that, if you are using the same user data dir, the same prompts will not reappear.