Unverified Commit e47248dc authored by Isis Lovecruft's avatar Isis Lovecruft
Browse files

Merge branch 'fix/9385-fuzzy-email-matching_r1' into develop

parents f8907b32 6a7d9d6d
......@@ -13,13 +13,16 @@
# :authors: The Tor Project, Inc.
# :license: This file is freely distributed as part of BridgeDB, see LICENSE
# for details.
# :copyright: (c) 2007-2013 The Tor Project, Inc.
# (c) 2007-2013, all sentient entities within the AUTHORS file
# :version: 0.0.9
# :copyright: (c) 2007-2014 The Tor Project, Inc.
# (c) 2007-2014, all sentient entities within the AUTHORS file
# :version: 0.0.10
#===============================================================================
#
# CHANGELOG:
# ~~~~~~~~~~
# Changes in version 0.0.10 - 2014-07-06
# * ADD EMAIL_BLACKLIST and EMAIL_FUZZY_MATCH settings.
#
# Changes in version 0.0.9 - 2014-06-06
# * ADD EMAIL_WHITELIST setting.
#
......@@ -311,6 +314,20 @@ EMAIL_DOMAIN_RULES = {'gmail.com': ["ignore_dots", "dkim"],
# A mapping of whitelisted email addresses to GnuPG key fingerprints:
EMAIL_WHITELIST = {}
# A list of blacklisted email addresses:
EMAIL_BLACKLIST = []
# An integer. The Levenshtein String Distance is calculated between the
# 'From:' email address on an incoming client request and each of the
# blacklisted email addresses above, and is then compared to this number. If
# the calculated distance is less than or equal to the number below, the
# address is assumed to be related to one of the above blacklisted spammers.
# Basically, this allows for fuzzy matching the blacklisted email addresses.
# Decreasing this number will allow more email requests through, because a
# closer match to one of the blacklisted addresses is then required before
# the address is blocked; increasing it will also block addresses which are
# less similar to a blacklisted one. Set to `0` to disable.
EMAIL_FUZZY_MATCH = 4
# If there are any IPs in this list, only allow incoming connections from
# those IPs.
EMAIL_RESTRICT_IPS = []
......
......@@ -38,6 +38,7 @@ from bridgedb.email import request
from bridgedb.email import templates
from bridgedb.parse import addr
from bridgedb.parse.addr import canonicalizeEmailDomain
from bridgedb.util import levenshteinDistance
from bridgedb import translations
......@@ -646,6 +647,19 @@ class SMTPAutoresponder(smtp.SMTPClient):
if not dkim.checkDKIM(self.incoming.message, self.incoming.domainRules):
return False
# If fuzzy matching is enabled via the EMAIL_FUZZY_MATCH setting, then
# calculate the Levenshtein String Distance (see
# :func:`~bridgedb.util.levenshteinDistance`):
if self.incoming.context.fuzzyMatch != 0:
for blacklistedAddress in self.incoming.context.blacklist:
distance = levenshteinDistance(self.incoming.canonicalFromEmail,
blacklistedAddress)
if distance <= self.incoming.context.fuzzyMatch:
logging.info("Fuzzy-matched %s to blacklisted address %s!"
% (self.incoming.canonicalFromEmail,
blacklistedAddress))
return False
return True
def send(self, response, retries=0, timeout=30, reaktor=reactor):
......
......@@ -57,6 +57,11 @@ class MailServerContext(object):
:ivar str fromAddr: Use this address in the email :header:`From:`
line for outgoing mail. (default: ``bridges@torproject.org``)
:ivar int nBridges: The number of bridges to send for each email.
:ivar list blacklist: A list of blacklisted email addresses, taken from
the ``EMAIL_BLACKLIST`` config setting.
:ivar int fuzzyMatch: An integer specifying the maximum Levenshtein
Distance from an incoming email address to a blacklisted email address
for the incoming email to be dropped.
:ivar gpgContext: A ``gpgme.GpgmeContext`` (as created by
:func:`bridgedb.crypto.getGPGContext`), or None if we couldn't create
a proper GPGME context for some reason.
......@@ -92,6 +97,8 @@ class MailServerContext(object):
self.domainMap = config.EMAIL_DOMAIN_MAP or {}
self.canon = self.buildCanonicalDomainMap()
self.whitelist = config.EMAIL_WHITELIST or {}
self.blacklist = config.EMAIL_BLACKLIST or []
self.fuzzyMatch = config.EMAIL_FUZZY_MATCH or 0
self.gpgContext = getGPGContext(config)
......
......@@ -38,6 +38,8 @@ EMAIL_DOMAIN_RULES = {
}
EMAIL_DOMAINS = ["gmail.com", "example.com", "localhost"]
EMAIL_WHITELIST = {'white@list.ed': 'ABCD1234ABCD1234ABCD1234ABCD1234ABCD1234'}
EMAIL_BLACKLIST = ['feidanchaoren0001@gmail.com']
EMAIL_FUZZY_MATCH = 4
EMAIL_USERNAME = "bridges"
EMAIL_SMTP_HOST = "127.0.0.1"
EMAIL_SMTP_PORT = 25
......@@ -56,6 +58,8 @@ EMAIL_DOMAIN_MAP = %s
EMAIL_DOMAIN_RULES = %s
EMAIL_DOMAINS = %s
EMAIL_WHITELIST = %s
EMAIL_BLACKLIST = %s
EMAIL_FUZZY_MATCH = %s
EMAIL_USERNAME = %s
EMAIL_SMTP_HOST = %s
EMAIL_SMTP_PORT = %s
......@@ -72,6 +76,8 @@ EMAIL_PORT = %s
repr(EMAIL_DOMAIN_RULES),
repr(EMAIL_DOMAINS),
repr(EMAIL_WHITELIST),
repr(EMAIL_BLACKLIST),
repr(EMAIL_FUZZY_MATCH),
repr(EMAIL_USERNAME),
repr(EMAIL_SMTP_HOST),
repr(EMAIL_SMTP_PORT),
......
......@@ -562,3 +562,10 @@ class SMTPAutoresponderTests(unittest.TestCase):
self.message.lines.insert(3, header)
self._setUpResponder()
self.assertFalse(self.responder.runChecks(emailFrom))
def test_SMTPAutoresponder_runChecks_blacklisted(self):
    """runChecks() should return False for a blacklisted email address.

    NOTE(review): presumably this address is rejected through fuzzy
    matching against the test config's ``EMAIL_BLACKLIST`` rather than
    through an exact match -- confirm against the test configuration.
    """
    blacklistedSender = Address('feidanchaoren0043@gmail.com')
    # Build the incoming message lines for this sender before the
    # responder is created.
    self._getIncomingLines(str(blacklistedSender))
    self._setUpResponder()
    checksPassed = self.responder.runChecks(blacklistedSender)
    self.assertFalse(checksPassed)
......@@ -17,6 +17,7 @@ from __future__ import unicode_literals
import os
from twisted.mail.smtp import Address
from twisted.trial import unittest
from bridgedb import util
......@@ -58,3 +59,44 @@ class MiscLoggingUtilTests(unittest.TestCase):
from bridgedb.persistent import Conf
util.configureLogging(Conf())
util.logging.info("BridgeDB's email address: bridges@torproject.org")
class LevenshteinDistanceTests(unittest.TestCase):
    """Unittests for :func:`bridgedb.util.levenshteinDistance`."""

    def test_levenshteinDistance_blank_blank(self):
        """Two empty strings should have a Levenshtein Distance of 0."""
        self.assertEqual(util.levenshteinDistance('', ''), 0)

    def test_levenshteinDistance_cat_cat(self):
        """Identical strings ('cat', 'cat') should have a distance of 0."""
        self.assertEqual(util.levenshteinDistance('cat', 'cat'), 0)

    def test_levenshteinDistance_bat_cat(self):
        """A single substitution ('bat' -> 'cat') should give a distance
        of 1.
        """
        self.assertEqual(util.levenshteinDistance('bat', 'cat'), 1)

    def test_levenshteinDistance_bar_cat(self):
        """Two substitutions ('bar' -> 'cat') should give a distance of 2."""
        self.assertEqual(util.levenshteinDistance('bar', 'cat'), 2)

    def test_levenshteinDistance_bridgedb_doge(self):
        """The distance between 'bridgedb' and 'doge' should be 6."""
        self.assertEqual(util.levenshteinDistance('bridgedb', 'doge'), 6)

    def test_levenshteinDistance_feidanchaoren0043_feidanchaoren0011(self):
        """The distance between the local parts of
        'feidanchaoren0043@gmail.com' and 'feidanchaoren0011@gmail.com'
        should be less than or equal to an EMAIL_FUZZY_MATCH-style
        threshold of 4.
        """
        # Two addresses count as a fuzzy match when their distance does not
        # exceed this threshold:
        threshold = 4
        first = Address('feidanchaoren0043@gmail.com')
        second = Address('feidanchaoren0011@gmail.com')
        self.assertLessEqual(
            util.levenshteinDistance(first.local, second.local),
            threshold)
......@@ -142,6 +142,46 @@ def configureLogging(cfg):
logging.info("Level: %s", logLevel)
logging.info("Safe Logging: %sabled" % ("En" if safelogging else "Dis"))
def levenshteinDistance(s1, s2, len1=None, len2=None,
                        offset1=0, offset2=0, memo=None):
    """Compute the Levenshtein Distance between two strings.

    The `Levenshtein String Distance Algorithm`_ computes the minimum number
    of single-character insertions, deletions, and substitutions which are
    needed to change **s1** into **s2**.

    The distance is computed iteratively, one row of the edit matrix at a
    time, so that long inputs (e.g. an overly long email address) cannot
    exhaust the interpreter's recursion limit.

    .. _Levenshtein String Distance Algorithm:
        https://en.wikipedia.org/wiki/Levenshtein_distance

    >>> levenshteinDistance('cat', 'cat')
    0
    >>> levenshteinDistance('cat', 'hat')
    1
    >>> levenshteinDistance('arma', 'armadillo')
    5

    :param str s1: The string which should be changed.
    :param str s2: The string which **s1** should be compared to.
    :type len1: int or None
    :param len1: The number of characters of **s1**, starting at
        **offset1**, to take into account. (default: ``len(s1)``)
    :type len2: int or None
    :param len2: The number of characters of **s2**, starting at
        **offset2**, to take into account. (default: ``len(s2)``)
    :param int offset1: The index in **s1** at which to start comparing.
    :param int offset2: The index in **s2** at which to start comparing.
    :type memo: dict or None
    :param memo: An optional cache, mapping ``"offset1,len1,offset2,len2"``
        strings to distances. If it already holds a result for this call's
        key, that result is returned; otherwise the computed distance is
        stored in it. The key does not include the strings themselves, so a
        cache must not be shared between different pairs of strings.
    :rtype: int
    :returns: The Levenshtein Distance between the selected portions of
        **s1** and **s2**.
    """
    len1 = len(s1) if len1 is None else len1
    len2 = len(s2) if len2 is None else len2

    key = ','.join([str(offset1), str(len1), str(offset2), str(len2)])
    if memo is not None:
        cached = memo.get(key)
        if cached is not None:
            return cached

    longer = s1[offset1:offset1 + len1]
    shorter = s2[offset2:offset2 + len2]
    # The distance is symmetric, so iterate over the longer string and keep
    # the (shorter) other one as the row; this minimises memory usage.
    if len(longer) < len(shorter):
        longer, shorter = shorter, longer

    # previous[j] is the distance between the characters of ``longer``
    # consumed so far and the first j characters of ``shorter``.
    previous = list(range(len(shorter) + 1))
    for i, char1 in enumerate(longer, 1):
        current = [i]
        for j, char2 in enumerate(shorter, 1):
            cost = 0 if char1 == char2 else 1
            current.append(min(previous[j] + 1,          # deletion
                               current[j - 1] + 1,       # insertion
                               previous[j - 1] + cost))  # substitution
        previous = current

    distance = previous[-1]
    if memo is not None:
        memo[key] = distance
    return distance
class JustifiedLogFormatter(logging.Formatter):
"""A logging formatter which pretty prints thread and calling function
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment