Commit 60d733ac authored by hackhard's avatar hackhard
Browse files

Code after review

parent d197efb2
Pipeline #9369 passed with stage
in 2 minutes and 27 seconds
......@@ -51,10 +51,10 @@ class Analyzer:
self.__job_queue_delay: float = float(self.__config["job_queue_delay"])
# Public class attributes
self.soup_t: BeautifulSoup = BeautifulSoup("", "html.parser")
self.soup_n: BeautifulSoup = BeautifulSoup("", "html.parser")
self.max_k: int = 150
self.min_k: int = 20
self.soup_tor: BeautifulSoup = BeautifulSoup("", "html.parser")
self.soup_non_tor: BeautifulSoup = BeautifulSoup("", "html.parser")
self.max_threshold_value: int = 150
self.min_threshold_value: int = 20
self.match_list: List[str] = (
self.__db_session.query(MetaData)
.filter(MetaData.key == "analyzer_match_list")
......@@ -95,10 +95,10 @@ class Analyzer:
self.__db_session.query(FetchCompleted, Relay).join(Relay)
).all()
proxy_countries_html_code = []
proxy_countries_html_data = []
for query in query_by_proxy:
if query.Proxy.country == query_by_relay[0].Relay.country:
proxy_countries_html_code.append(query.FetchCompleted.html_data)
proxy_countries_html_data.append(query.FetchCompleted.html_data)
tor = query_by_domain.filter(Fetcher.uses_proxy_type == "tor").first()
non_tor = query_by_domain.filter(Fetcher.uses_proxy_type == None).first()
......@@ -120,7 +120,7 @@ class Analyzer:
HAR_json_tor,
non_tor.html_data,
HAR_json_non_tor,
proxy_countries_html_code,
proxy_countries_html_data,
)
# Non tor from the FetchCompleted
......@@ -221,22 +221,34 @@ class Analyzer:
)
# Similar
if score_proxy < self.min_k and score_tor < self.min_k:
if (
score_proxy < self.min_threshold_value
and score_tor < self.min_threshold_value
):
self.__logger.info("Not Blocked")
self.consensus_lite_dom_value = 0
# Tor Blocked
elif score_proxy < self.min_k and score_tor > self.min_k:
elif (
score_proxy < self.min_threshold_value
and score_tor > self.min_threshold_value
):
self.__logger.info("Tor Blocked")
self.consensus_lite_dom_value = 1
# Either Both Redirected to other page or both blocked
elif score_proxy > self.min_k and score_tor > self.min_k:
elif (
score_proxy > self.min_threshold_value
and score_tor > self.min_threshold_value
):
self.__logger.info("Both Redirected to other page or Both Blocked")
self.consensus_lite_dom_value = 2
# Proxy not good, Tor better
elif score_proxy > self.min_k and score_tor < self.min_k:
elif (
score_proxy > self.min_threshold_value
and score_tor < self.min_threshold_value
):
self.__logger.info("Tor unblocked, proxy blocked")
self.consensus_lite_dom_value = 3
......@@ -244,7 +256,8 @@ class Analyzer:
elif (
score_proxy < 0
and score_tor < 0
and (100 * abs(avg_proxy - tor_dom)) / avg_proxy < self.min_k
and (100 * abs(avg_proxy - tor_dom)) / avg_proxy
< self.min_threshold_value
):
self.__logger.info("Non-Tor blocked")
self.consensus_lite_dom_value = 4
......@@ -269,7 +282,7 @@ class Analyzer:
tor_c = 0
tor = 0
# If captcha in html of tor:
if "captcha" in self.soup_t and "captcha" not in self.soup_n:
if "captcha" in self.soup_tor and "captcha" not in self.soup_non_tor:
tor_c = 1
# If captcha in both, tor_html and non_tor html, or not anywhere:
else:
......@@ -295,7 +308,7 @@ class Analyzer:
self,
tor_html_data: str,
non_tor_html_data: str,
proxy_countries_html_code: List[str],
proxy_countries_html_data: List[str],
) -> None:
"""
Analyzes dom
......@@ -304,70 +317,64 @@ class Analyzer:
:type tor_html_data: str
:param non_tor_html_data: Non-Tor HTML data
:type non_tor_html_data: str
:param proxy_countries_html_code: List of Proxy html data
:type proxy_countries_html_code: List[str]
:param proxy_countries_html_data: List of Proxy html data
:type proxy_countries_html_data: List[str]
"""
self.soup_t = BeautifulSoup(tor_html_data, "html.parser")
count_t = 0
node_tor = []
node_non_tor = []
self.soup_tor = BeautifulSoup(tor_html_data, "html.parser")
# Count the number of nodes in tor
for tag in self.soup_t.find_all(True):
node_tor.append(tag)
count_t += 1
self.soup_n = BeautifulSoup(non_tor_html_data, "html.parser")
tor_node_count = len(self.soup_tor.find_all(True))
self.soup_non_tor = BeautifulSoup(non_tor_html_data, "html.parser")
# Count the number of nodes in non-tor
count_n = 0
for tag in self.soup_n.find_all(True):
node_non_tor.append(tag)
count_n += 1
non_tor_node_count = len(self.soup_non_tor.find_all(True))
# Count the number of nodes returned by the proxies
proxy_node_count = []
proxy_node_detail = []
captcha_val_proxy = []
for proxy_html in proxy_countries_html_code:
self.captcha_proxy_val = []
for proxy_html in proxy_countries_html_data:
node_proxy = []
soup_p = BeautifulSoup(proxy_html, "html.parser")
soup_proxy = BeautifulSoup(proxy_html, "html.parser")
# Check for captcha here itself, thereby reducing the space of the list as well as execution later
captcha_proxy = str(soup_p).lower()
captcha_proxy = str(soup_proxy).lower()
# Records whether each proxy page contains a captcha, as 0 (no captcha) or 1 (captcha), so that it can be accessed from another class
captcha_val_proxy.append(int("captcha" in captcha_proxy))
for tag in soup_p.find_all(True):
self.captcha_proxy_val.append(int("captcha" in captcha_proxy))
for tag in soup_proxy.find_all(True):
node_proxy.append(tag)
proxy_node_detail.append(node_proxy)
proxy_node_count.append(len(node_proxy))
self.captcha_proxy_val = captcha_val_proxy
self.__logger.info(
"Nodes by tor: %f, non-tor: %f and proxies: %s",
count_t,
count_n,
tor_node_count,
non_tor_node_count,
proxy_node_count,
)
try:
dom_score = 100 * ((count_n - count_t) / count_t)
dom_score = abs(
100 * ((non_tor_node_count - tor_node_count) / tor_node_count)
)
except ZeroDivisionError as e:
self.__logger.info("Zero Error, check tor Dom: %s", e)
self.__logger.info("DOM Score : %s", dom_score)
self.soup_t = str(self.soup_t).lower()
self.soup_n = str(self.soup_n).lower()
self.soup_tor = str(self.soup_tor).lower()
self.soup_non_tor = str(self.soup_non_tor).lower()
if self.captcha_checker() is False:
if abs(dom_score) > 0:
if abs(dom_score) > self.max_k:
if dom_score > 0:
if dom_score > self.max_threshold_value:
# Random value to check the performance.
# Might need some more experiments to come back with the correct value
self.__logger.info("Tor most probably Errors!!")
self.dom_analyze_value = 0
# Call Consensus lite
self.consensus_lite_dom(count_t, count_n, proxy_node_count)
elif abs(dom_score) < self.min_k:
self.consensus_lite_dom(
tor_node_count, non_tor_node_count, proxy_node_count
)
elif dom_score < self.min_threshold_value:
# Random value to check the performance.
# Might need some more experiments to come back with the correct value
# checks for keywords to help in this case
......@@ -377,20 +384,19 @@ class Analyzer:
self.__logger.info("Doubtful case!!")
self.__logger.info("checking for keywords...")
# checks for keywords to help in this case
res = 0
for _ in self.match_list:
if _ in self.soup_t and _ not in self.soup_n:
res = 1
if res == 0:
self.__logger.info(
"Survived Checklist but still doubt (Further modules might help)"
)
self.dom_analyze_value = 2
else:
self.__logger.info("Tor Blocked : checklist!! ")
self.dom_analyze_value = 3
if _ in self.soup_tor and _ not in self.soup_non_tor:
self.__logger.info("Tor Blocked : checklist!! ")
self.dom_analyze_value = 3
else:
self.__logger.info(
"Survived Checklist but still doubt (Further modules might help)"
)
self.dom_analyze_value = 2
# Call Consensus lite
self.consensus_lite_dom(count_t, count_n, proxy_node_count)
self.consensus_lite_dom(
tor_node_count, non_tor_node_count, proxy_node_count
)
else:
# When DOM is equal
self.__logger.info("Equal")
......@@ -402,7 +408,7 @@ class Analyzer:
tor_http_requests: Dict[str, Any],
non_tor_html_data: str,
non_tor_http_requests: Dict[str, Any],
proxy_countries_html_code: List[str],
proxy_countries_html_data: List[str],
) -> None:
"""
......@@ -416,65 +422,62 @@ class Analyzer:
:type non_tor_html_data: str
:param non_tor_http_requests: Non-Tor HAR
:type non_tor_http_requests: Dict[str, Any]
:param proxy_countries_html_code: Html data of all given proxies matching the location of tor nodes.
:type proxy_countries_html_code: List[str]
:param proxy_countries_html_data: Html data of all given proxies matching the location of tor nodes.
:type proxy_countries_html_data: List[str]
"""
tor_H = {}
tor_N = {}
tor_HAR = {}
non_tor_HAR = {}
try:
for i in range(0, len(tor_http_requests["log"]["entries"])):
tor_H[
tor_HAR[
tor_http_requests["log"]["entries"][i]["request"]["url"]
] = tor_http_requests["log"]["entries"][i]["response"]["status"]
# pylint: disable=C0206
for i in tor_H:
if tor_H[i] != 0 or tor_H != "" or tor_H is not None:
self.tor_store[i] = tor_H[i] # type: ignore
for i in tor_HAR:
if tor_HAR[i] != 0 or tor_HAR != "" or tor_HAR is not None:
self.tor_store[i] = tor_HAR[i] # type: ignore
for i in range(len(non_tor_http_requests["log"]["entries"])):
tor_N[
non_tor_HAR[
non_tor_http_requests["log"]["entries"][i]["request"]["url"]
] = non_tor_http_requests["log"]["entries"][i]["response"]["status"]
# pylint: disable=C0206
for i in tor_N:
if tor_N[i] != 0 or tor_N != "" or tor_N is not None:
self.non_store[i] = tor_N[i] # type: ignore
for i in non_tor_HAR:
if non_tor_HAR[i] != 0 or non_tor_HAR != "" or non_tor_HAR is not None:
self.non_store[i] = non_tor_HAR[i] # type: ignore
first_url_t = list(self.tor_store.keys())[0]
first_status_t = self.tor_store[str(first_url_t)]
first_status_tor = int(self.tor_store[str(first_url_t)])
# non tor use HARExportTrigger
first_url_nt = list(self.non_store.keys())[0]
first_status_nt = self.non_store[str(first_url_nt)]
first_status_non_tor = int(self.non_store[str(first_url_nt)])
if int(first_status_t) > 399 and int(first_status_nt) < 400:
if first_status_tor > 399 and first_status_non_tor < 400:
# Error for tor and no error for non tor
self.__logger.info("Tor Blocked")
self.status_check_value = 0
elif int(first_status_t) > 399 and int(first_status_nt) > 399:
elif first_status_tor > 399 and first_status_non_tor > 399:
# Both blocked on tor and non-tor
self.__logger.info("Site is blocked on tor and non-tor browsers")
self.status_check_value = 1
elif int(first_status_t) < 300 and int(first_status_nt) > 399:
elif first_status_tor < 300 and first_status_non_tor > 399:
# When tor isn't blocked and non-tor is blocked
self.__logger.info(
"Tor is not blocked, rather non-tor browser is blocked"
)
self.status_check_value = 2
else:
if int(first_status_t) > 299 and int(first_status_t) < 400:
# pylint:disable=R1716
if (first_status_tor > 299 and first_status_tor < 400) or (
first_status_tor < 300 and first_status_non_tor < 300
):
# Check if tor returns error pages or warning or captchas due to reload
self.dom_analyze(
tor_html_data, non_tor_html_data, proxy_countries_html_code
)
elif int(first_status_t) < 300 and int(first_status_nt) < 300:
# When both tor and non tor returns no errors
self.dom_analyze(
tor_html_data, non_tor_html_data, proxy_countries_html_code
tor_html_data, non_tor_html_data, proxy_countries_html_data
)
except TypeError:
self.__logger.debug(
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment