base_fetcher.py 16.6 KB
Newer Older
1
import os
2
import json
Barkin Simsek's avatar
Barkin Simsek committed
3
import shutil
Barkin Simsek's avatar
Barkin Simsek committed
4
import logging
5
from typing import Any, List, Tuple, Union, Optional
Barkin Simsek's avatar
Barkin Simsek committed
6

7
from selenium import webdriver
8
from selenium.common.exceptions import WebDriverException
9
10
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
11
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
Barkin Simsek's avatar
Barkin Simsek committed
12

Barkin Simsek's avatar
Barkin Simsek committed
13
from captchamonitor.utils.config import Config
14
from captchamonitor.utils.exceptions import MissingProxy, HarExportExtensionError
Barkin Simsek's avatar
Barkin Simsek committed
15
16
17
18


class BaseFetcher:
    """
    Base fetcher class that will be inherited by the actual fetchers, used to unify
    the fetcher interfaces
    """

    def __init__(
        self,
        config: Config,
        url: str,
        proxy: Optional[Tuple[str, int]] = None,
        use_proxy_type: Optional[str] = None,
        page_timeout: int = 30,
        script_timeout: int = 30,
        url_change_timeout: int = 30,
        options: Optional[dict] = None,
    ) -> None:
        """
        Stores the fetch settings, applies the overrides found in ``options`` and
        validates the HAR export extension files referenced by the config. The
        constructor itself does not connect to any browser.

        :param config: Config class
        :type config: Config
        :param url: The URL to fetch
        :type url: str
        :param proxy: Proxy host and port, defaults to None
        :type proxy: Optional[Tuple[str, int]], optional
        :param use_proxy_type: Proxy type to use: "tor" or "http", defaults to None
        :type use_proxy_type: Optional[str], optional
        :param page_timeout: Maximum time allowed for a web page to load, defaults to 30
        :type page_timeout: int
        :param script_timeout: Maximum time allowed for a JS script to respond, defaults to 30
        :type script_timeout: int
        :param url_change_timeout: Maximum time allowed while waiting for driver URL to change, defaults to 30
        :type url_change_timeout: int
        :param options: Dictionary of options to pass to the fetcher, defaults to None.
            Recognised keys: "gdpr_remove", "gdpr_wait_for_url_change",
            "gdpr_keywords", "page_timeout", "script_timeout", "url_change_timeout"
        :type options: Optional[dict], optional
        :raises MissingProxy: If use_proxy_type is not None but no proxy provided
        :raises HarExportExtensionError: If a configured extension file is missing
            or does not carry the expected suffix
        """
        # Public attributes: what to fetch and how
        self.url: str = url
        self.use_proxy_type: Optional[str] = use_proxy_type
        self.options: Optional[dict] = options
        self.page_timeout: int = page_timeout
        self.script_timeout: int = script_timeout
        self.url_change_timeout: int = url_change_timeout
        self.gdpr_remove: bool = False
        self.gdpr_wait_for_url_change: bool = False
        self.gdpr_keywords: List[str] = ["Accept"]

        # Public attributes that are only declared here; the constructor does
        # not give them a value
        self.container_host: str
        self.container_port: str
        self.driver: webdriver.Remote
        self.page_source: str
        self.page_cookies: str
        self.page_title: str
        self.page_har: str

        # Protected attributes
        self._logger = logging.getLogger(__name__)
        self._config: Config = config
        self._proxy: Optional[Tuple[str, int]] = proxy
        self._proxy_host: str
        self._proxy_port: int
        self._selenium_options: Any
        self._selenium_executor_url: str
        self._desired_capabilities: webdriver.DesiredCapabilities
        self._num_retries_on_fail: int = 3
        self._delay_in_seconds_between_retries: int = 3

        if proxy is None:
            # A proxy type without a proxy address cannot be honoured
            if use_proxy_type is not None:
                raise MissingProxy
        else:
            # Normalise the proxy host and port
            self._proxy_host = str(proxy[0])
            self._proxy_port = int(proxy[1])

        # Let the caller supplied options win over the defaults set above
        if options is not None:
            overridable = (
                "gdpr_remove",
                "gdpr_wait_for_url_change",
                "gdpr_keywords",
                "page_timeout",
                "script_timeout",
                "url_change_timeout",
            )
            for name in overridable:
                setattr(self, name, options.get(name, getattr(self, name)))

        # Paths and id of the HAR export extensions, as listed in the config
        self._har_export_extension_xpi: str = config["asset_har_export_extension_xpi"]
        self._har_export_extension_xpi_id: str = config[
            "asset_har_export_extension_xpi_id"
        ]
        self._har_export_extension_crx: str = config["asset_har_export_extension_crx"]

        # Fail early when an extension file is unusable
        for extension_path, suffix in (
            (self._har_export_extension_xpi, ".xpi"),
            (self._har_export_extension_crx, ".crx"),
        ):
            self._check_extension_validity(extension_path, suffix)
124

125
    @staticmethod
    def _get_selenium_executor_url(container_host: str, container_port: str) -> str:
        """
        Builds the command executor URL used by the Selenium remote webdriver

        The URL always uses the ``http`` scheme and ends with the ``/wd/hub`` path.

        :param container_host: Host to the Selenium remote webdriver
        :type container_host: str
        :param container_port: Port to the Selenium remote webdriver
        :type container_port: str
        :return: Command executor URL
        :rtype: str
        """
        executor_url = f"http://{container_host}:{container_port}/wd/hub"
        return executor_url

139
    def _connect_to_selenium_remote_web_driver(
        self,
        container_name: str,
        desired_capabilities: webdriver.DesiredCapabilities,
        command_executor: str,
        options: Optional[Any] = None,
    ) -> None:
        """
        Connects Selenium remote driver to a browser container and stores the
        resulting driver in ``self.driver``

        :param container_name: Name of the target browser, just will be used for logging
        :type container_name: str
        :param desired_capabilities: webdriver.DesiredCapabilities object from Selenium
        :type desired_capabilities: webdriver.DesiredCapabilities object
        :param command_executor: Command executor URL for Selenium
        :type command_executor: str
        :param options: Browser options object from Selenium (for example the
            FirefoxOptions / ChromeOptions built by the common setup methods),
            defaults to None
        :type options: webdriver.Options object, optional
        """
        # Connect to browser container
        self.driver = webdriver.Remote(
            desired_capabilities=desired_capabilities,
            command_executor=command_executor,
            options=options,
        )

        # Limit how long a page load may take
        self.driver.set_page_load_timeout(self.page_timeout)

        # Limit how long asynchronous scripts (the HAR export trigger) may take
        self.driver.set_script_timeout(self.script_timeout)

        # Log the current status
        self._logger.debug("Connected to the %s container", container_name)
173

174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
    def _check_extension_validity(self, extension: str, endswith: str) -> None:
        """
        Verifies that the extension path points to an existing file whose name
        ends with the expected suffix

        The existence check runs first; a warning is logged before raising.

        :param extension: Absolute path to the extension file
        :type extension: str
        :param endswith: The file extension for the browser extension
        :type endswith: str
        :raises HarExportExtensionError: If given extension is not valid
        """
        if not os.path.isfile(extension):
            problem = "Provided extension file doesn't exist: %s"
        elif not extension.endswith(endswith):
            problem = "Provided extension file is not valid: %s"
        else:
            # Nothing wrong with the file
            return

        self._logger.warning(problem, extension)
        raise HarExportExtensionError

    def _install_har_export_extension_xpi(self, directory: str) -> None:
        """
        Installs the HAR Export Trigger extension to Firefox based browsers

        The xpi file is copied into ``directory`` and named after the extension
        id with an ``.xpi`` suffix. The directory is created when missing.

        :param directory: Absolute directory path to install the extension
        :type directory: str
        """
        try:
            # Create first and react to the outcome instead of checking for
            # existence beforehand, which leaves a window for a race
            os.makedirs(directory)
        except FileExistsError:
            # Already there: leave its permissions untouched
            pass
        else:
            # Only a directory created here gets its mode adjusted
            os.chmod(directory, 0o755)

        addon_file = os.path.join(
            directory, self._har_export_extension_xpi_id + ".xpi"
        )
        shutil.copy(self._har_export_extension_xpi, addon_file)
210

211
212
213
214
215
216
217
218
219
220
221
    def _install_har_export_extension_crx(
        self, chrome_options: webdriver.ChromeOptions
    ) -> None:
        """
        Registers the HAR Export Trigger crx file on the given Chrome options,
        for Chromium based browsers

        :param chrome_options: webdriver.ChromeOptions from the Selenium driver
        :type chrome_options: webdriver.ChromeOptions
        """
        crx_path = self._har_export_extension_crx
        chrome_options.add_extension(crx_path)

222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
    def _setup_common_firefox_based_fetcher(self, ff_profile: FirefoxProfile) -> None:
        """
        Performs the common setup procedures for Firefox based fetchers, including Firefox itself

        Sets the executor URL, installs the HAR export extension into the
        profile, writes the devtools / update / proxy preferences and prepares
        the Selenium capabilities and options.

        :param ff_profile: Firefox Profile created for the webdriver
        :type ff_profile: FirefoxProfile
        """
        # Where the remote webdriver listens
        self._selenium_executor_url = self._get_selenium_executor_url(
            self.container_host, self.container_port
        )

        # Put the HAR export extension into the profile
        self._install_har_export_extension_xpi(ff_profile.extensionsDir)

        set_pref = ff_profile.set_preference

        # Network monitor settings needed to record HAR, then disable updates
        base_preferences: List[Tuple[str, Any]] = [
            ("devtools.netmonitor.enabled", True),
            ("devtools.toolbox.selectedTool", "netmonitor"),
            ("devtools.netmonitor.har.compress", False),
            ("devtools.netmonitor.har.includeResponseBodies", False),
            ("devtools.netmonitor.har.jsonp", False),
            ("devtools.netmonitor.har.jsonpCallback", False),
            ("devtools.netmonitor.har.forceExport", False),
            ("devtools.netmonitor.har.enableAutoExportToFile", False),
            ("devtools.netmonitor.har.pageLoadedTimeout", "2500"),
            ("app.update.enabled", False),
        ]
        for pref_name, pref_value in base_preferences:
            set_pref(pref_name, pref_value)

        # Proxy settings; any other proxy type leaves the profile untouched
        proxy_preferences: List[Tuple[str, Any]] = []
        if self.use_proxy_type == "tor":
            # Tor is reached over SOCKS5, with DNS resolved through the proxy
            proxy_preferences = [
                ("network.proxy.type", 1),
                ("network.proxy.socks_version", 5),
                ("network.proxy.socks", str(self._proxy_host)),
                ("network.proxy.socks_port", int(self._proxy_port)),
                ("network.proxy.socks_remote_dns", True),
            ]
        elif self.use_proxy_type == "http":
            # The same proxy endpoint is used for HTTP, SSL and FTP
            proxy_preferences = [
                ("network.proxy.type", 1),
                ("network.proxy.proxy_over_tls", True),
                ("network.proxy.share_proxy_settings", False),
                ("network.proxy.http", str(self._proxy_host)),
                ("network.proxy.http_port", int(self._proxy_port)),
                ("network.proxy.ssl", str(self._proxy_host)),
                ("network.proxy.ssl_port", int(self._proxy_port)),
                ("network.proxy.ftp", str(self._proxy_host)),
                ("network.proxy.ftp_port", int(self._proxy_port)),
            ]
        for pref_name, pref_value in proxy_preferences:
            set_pref(pref_name, pref_value)

        # Apply the preferences
        ff_profile.update_preferences()

        # Selenium side: Firefox capabilities plus options carrying the profile
        self._desired_capabilities = webdriver.DesiredCapabilities.FIREFOX.copy()
        firefox_options = webdriver.FirefoxOptions()
        firefox_options.profile = ff_profile
        firefox_options.add_argument("--devtools")
        self._selenium_options = firefox_options

    def _setup_common_chromium_based_fetcher(self) -> None:
        """
        Performs the common setup procedures for Chromium based fetchers, including Chromium itself

        Sets the executor URL, creates the Chrome options with the HAR export
        extension installed, and configures the proxy when one is requested.
        Capabilities already present in ``self._desired_capabilities`` are kept;
        otherwise they start from a copy of Selenium's Chrome defaults.
        """
        # Get the executor URL
        self._selenium_executor_url = self._get_selenium_executor_url(
            self.container_host, self.container_port
        )

        self._selenium_options = webdriver.ChromeOptions()

        # __init__ only annotates _desired_capabilities without assigning it, so
        # the "http" branch below would fail with AttributeError unless the
        # caller populated it first. Mirror the Firefox setup and make sure a
        # capabilities dictionary exists, without discarding a caller-provided one.
        if getattr(self, "_desired_capabilities", None) is None:
            self._desired_capabilities = webdriver.DesiredCapabilities.CHROME.copy()

        # Install the extensions
        self._install_har_export_extension_crx(self._selenium_options)

        # Enable the network monitoring tools to record HAR
        self._selenium_options.add_argument("--auto-open-devtools-for-tabs")

        if self.use_proxy_type == "tor":
            # Tor is reached over SOCKS5 via a browser command line switch
            proxy = f"socks5://{self._proxy_host}:{self._proxy_port}"
            self._selenium_options.add_argument(f"--proxy-server={proxy}")

        elif self.use_proxy_type == "http":
            # HTTP proxies are configured through the WebDriver capabilities
            proxy = f"{self._proxy_host}:{self._proxy_port}"
            self._desired_capabilities["proxy"] = {
                "httpProxy": proxy,
                "ftpProxy": proxy,
                "sslProxy": proxy,
                "proxyType": "MANUAL",
            }
            self._desired_capabilities["acceptSslCerts"] = True

316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
    def _remove_gdpr_popup(self) -> None:
        """
        Tries to dismiss a GDPR / cookie consent popup on the current page

        For every keyword in ``self.gdpr_keywords`` that occurs in the page
        HTML, every element whose text contains the keyword is clicked; click
        errors are logged to the browser console and ignored. When
        ``self.gdpr_wait_for_url_change`` is set, the method then waits up to
        ``self.url_change_timeout`` for the URL to change and for the document
        to finish loading (``WebDriverWait.until`` raises Selenium's
        TimeoutException when a wait expires).

        Nothing is executed when no non-empty keyword is configured.
        """
        self._logger.debug("Trying to remove the GDPR popup")

        # Empty keywords are dropped: an empty string is contained in every
        # element, so it would make the script click the whole page
        keywords = [str(keyword) for keyword in self.gdpr_keywords]
        keywords = [keyword for keyword in keywords if keyword]
        if not keywords:
            self._logger.debug("No GDPR keywords to look for, skipping")
            return

        # The keywords are handed over as a script argument instead of being
        # pasted into the JavaScript source, so quotes and backslashes inside a
        # keyword cannot break the script. All variables are declared with
        # "var" to avoid leaking globals into the page.
        js_gdpr_remover = r"""
            var keywords = arguments[0];

            // XPath 1.0 has no escape character inside string literals, so
            // pick the quote that does not occur or fall back to concat()
            function toXPathLiteral(text) {
                if (text.indexOf("'") === -1) {
                    return "'" + text + "'";
                }
                if (text.indexOf('"') === -1) {
                    return '"' + text + '"';
                }
                return "concat('" + text.split("'").join("', \"'\", '") + "')";
            }

            for (var i = 0; i < keywords.length; i++) {
                var keyword = keywords[i];
                if (!document.documentElement.innerHTML.includes(keyword)) {
                    continue;
                }
                var path = "//*[contains(., " + toXPathLiteral(keyword) + ")]";
                console.log(path);
                var nodes = document.evaluate(path, document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
                for (var j = 0; j < nodes.snapshotLength; j++) {
                    try {
                        nodes.snapshotItem(j).click();
                    } catch (err) {
                        console.log(err);
                    }
                }
            }
            """

        # Get a copy of the URL
        old_url = self.driver.current_url

        # Execute the GDPR remover
        self.driver.execute_script(js_gdpr_remover, keywords)

        if self.gdpr_wait_for_url_change:
            # Accepting the popup is expected to navigate away; wait for the
            # new URL and then for the new document to be fully loaded
            WebDriverWait(self.driver, self.url_change_timeout).until(
                EC.url_changes(old_url)
            )
            WebDriverWait(self.driver, self.url_change_timeout).until(
                lambda driver: driver.execute_script("return document.readyState")
                == "complete"
            )

361
    def _fetch_with_selenium_remote_web_driver(self) -> None:
        """
        Loads ``self.url`` in the connected remote web driver and records the
        page source, cookies, title and HAR on the instance

        The GDPR popup remover runs after the page load when ``self.gdpr_remove``
        is set. The HAR is stored as a JSON string of the form ``{"log": ...}``.
        """
        # Asks the HAR export extension for the log and hands it back through
        # the callback Selenium appends to the script arguments
        har_export_script = """
            var callback = arguments[arguments.length - 1];
            HAR.triggerExport().then((harLog) => { callback(harLog) });
            """

        browser = self.driver
        browser.get(self.url)

        if self.gdpr_remove:
            self._remove_gdpr_popup()

        self.page_source = browser.page_source
        # NOTE(review): page_cookies is annotated as str in __init__, yet the
        # value of get_cookies() is stored here unserialised -- confirm which
        # one the callers expect
        self.page_cookies = browser.get_cookies()
        self.page_title = browser.title

        har_log = browser.execute_async_script(har_export_script)
        self.page_har = json.dumps({"log": har_log})
380

Barkin Simsek's avatar
Barkin Simsek committed
381
    def get_selenium_logs(self) -> dict:
382
383
384
385
386
387
388
389
390
391
392
        """
        Obtains and returns all kinds of available Selenium logs

        :return: Dictionary of logs with different log types
        :rtype: dict
        """
        logs = {}
        for log_type in self.driver.log_types:
            logs[log_type] = self.driver.get_log(log_type)
        return logs

Barkin Simsek's avatar
Barkin Simsek committed
393
394
395
    def get_screenshot_from_selenium_remote_web_driver(
        self, image_type: Optional[str] = "base64"
    ) -> Union[str, bytes]:
396
        """
397
        Takes a screenshot of the current page
398
399
400
401
402
403
404
405
406
407

        :param image_type: Type of screenshot to return, defaults to "base64"
        :type image_type: str, optional
        :return: Screenshot as png file or base64 encoded depending on selected type]
        :rtype: png file or str depending on selected type
        """
        if image_type == "base64":
            return self.driver.get_screenshot_as_base64()
        # else
        return self.driver.get_screenshot_as_png()
408

409
    def close(self) -> None:
        """
        Quits the driver, if one was ever attached to this fetcher

        Safe to call on an instance that never connected to a browser.
        """
        if not hasattr(self, "driver"):
            # Never connected, nothing to clean up
            return

        try:
            self.driver.quit()
        except WebDriverException:
            # We can safely ignore "No active session with ID XXXXX" exceptions
            return