mass_validate_email.py 18 KB


  1. #!/usr/bin/python
  2. #
# Python module for mass validation of email addresses
#
# This module was inspired by (and uses) the validate_email library
# written by Syrus Akbary :
#
# https://github.com/SyrusAkbary/validate_email
#
# Its main goal is to optimize mass validation by caching
# bad (or good) domains and MX servers.
  12. #
  13. # Author: Benjamin Renard
  14. # Website: https://gogs.zionetrix.net/bn8/mass_validate_email
  15. # Licence: LGPL
  16. """ Mass email addresses validation tools """
  17. import smtplib
  18. import socket
  19. import sys
  20. import logging
  21. import DNS
  22. from validate_email import validate_email
  23. try:
  24. DNS.DiscoverNameServers()
  25. except DNS.ServerError, err:
  26. logging.fatal("Error discovering DNS servers : %s", err)
  27. sys.exit(1)
  28. # Exception
  29. class EmailInvalid(Exception):
  30. """ Generic invalid email exception """
  31. def __init__(self, email, error_msg=None):
  32. self.email = email
  33. self.error_msg = error_msg or "Invalid email address"
  34. super(EmailInvalid, self).__init__("%s : %s" % (email, self.error_msg))
  35. class EmailInvalidSyntax(EmailInvalid):
  36. """ Exception raised when an email address is invalid by syntax """
  37. def __init__(self, email):
  38. super(EmailInvalidSyntax, self).__init__(email, "Invalid email address syntax")
  39. class EmailInvalidDomain(EmailInvalid):
  40. """ Exceptiond raise when an email address is from an invalid mail domain """
  41. def __init__(self, email, domain, cause):
  42. self.domain = domain
  43. self.cause = cause
  44. super(EmailInvalidDomain, self).__init__(email, "Invalid email domain : %s" % domain)
  45. class NoMXhostAvailable(EmailInvalid):
  46. """ Exception raised when an email address is from a mail domain without available MX host """
  47. def __init__(self, email, mx_hosts=None, mx_hosts_error=None):
  48. self.mx_hosts = mx_hosts
  49. self.mx_hosts_error = mx_hosts_error or {}
  50. if mx_hosts_error:
  51. super(NoMXhostAvailable, self).__init__(email, "No MX hosts available : %s" % ', '.join([mx_hosts_error[host].error_msg for host in mx_hosts_error]))
  52. else:
  53. super(NoMXhostAvailable, self).__init__(email, "No MX hosts available")
  54. class EmailRefused(EmailInvalid):
  55. """ Exception raised when an email address is refused by the MX host """
  56. def __init__(self, email, mx_host=None):
  57. self.mx_hosts = mx_host
  58. if mx_host:
  59. super(EmailRefused, self).__init__(email, "MX host %s refused this email" % mx_host)
  60. else:
  61. super(EmailRefused, self).__init__(email, "MX hosts refused this email")
  62. class MXUnavailable(EmailInvalid):
  63. """ Exception raised when an MX host is not available to validate an email address """
  64. def __init__(self, email, mx_host, error_msg=None):
  65. self.mx_host = mx_host
  66. super(MXUnavailable, self).__init__(email, error_msg or "%s : MX host %s unavailable" % (email, mx_host))
  67. class TemporaryErrorOnMX(MXUnavailable):
  68. """ Exception raised when an MX host raise a temporary error validating an email address """
  69. def __init__(self, email, mx_host, msg=None):
  70. self.msg = msg
  71. if msg:
  72. error_msg = "%s : temporary error occured on MX host %s : %s" % (email, mx_host, msg)
  73. else:
  74. error_msg = "%s : temporary error occured on MX host %s" % (email, mx_host)
  75. super(TemporaryErrorOnMX, self).__init__(email, mx_host, error_msg)
  76. class MXRefuseConnection(MXUnavailable):
  77. """ Exception raised when an MX host refuse connection validating an email address """
  78. def __init__(self, email, mx_host, msg=None):
  79. self.msg = msg
  80. if msg:
  81. error_msg = "%s : MX host %s refuse connection : %s" % (email, mx_host, msg)
  82. else:
  83. error_msg = "%s : MX host %s refuse connection" % (email, mx_host)
  84. super(MXRefuseConnection, self).__init__(email, mx_host, error_msg)
  85. # Options
  86. class OptionsClass(object):
  87. """ Class used to defined validation options """
  88. debug = False
  89. debugsmtp = False
  90. checkmx = False
  91. verifyaddress = False
  92. usesmtpvrfy = False
  93. acceptoncnxrefused = False
  94. acceptontemporaryerror = False
  95. raiseonerror = False
# Module-wide options instance read by the validation functions of this module
options = OptionsClass()
# NOTE(review): this test runs only once, at import time, on a freshly created
# instance whose verifyaddress still has its class default (False); it does
# not react to a later change of options.verifyaddress.
if options.verifyaddress:
    options.checkmx = True
  99. def clean_mail(mail):
  100. mail = str(mail).lower().strip()
  101. return mail
  102. # Cache domain info
  103. # Domains's MX hosts
  104. domains_mx_hosts = {}
  105. # List of valid domains
  106. valid_domains = []
  107. # List of invalid domains (with invalid cause)
  108. invalid_domains = {}
  109. # List of domain without available MX host (with unavailable cause)
  110. mx_unavailable_domain = {}
  111. def get_mail_domain_and_mx_hosts(mail):
  112. """ Retreive domain name and it's MX hosts from an email address """
  113. domain = mail[mail.find('@')+1:]
  114. if domain in domains_mx_hosts:
  115. return (domain, domains_mx_hosts[domain])
  116. if domain in invalid_domains:
  117. if options.raiseonerror:
  118. raise EmailInvalidDomain(mail, domain, invalid_domains[domain])
  119. return (domain, False)
  120. try:
  121. # Retreive domain's MX hosts info
  122. mx_hosts_info = DNS.mxlookup(domain)
  123. if len(mx_hosts_info) > 0:
  124. domains_mx_hosts[domain] = [mx_host_info[1] for mx_host_info in mx_hosts_info]
  125. logging.debug("MX of domain %s : %s", domain, ','.join(domains_mx_hosts[domain]))
  126. valid_domains.append(domain)
  127. return (domain, domains_mx_hosts[domain])
  128. # If domain have no MX hosts, try on domain name it self
  129. if connect_to_mx(domain):
  130. domains_mx_hosts[domain] = [domain]
  131. logging.debug("MX of domain %s : %s", domain, ','.join(domains_mx_hosts[domain]))
  132. valid_domains.append(domain)
  133. return (domain, domains_mx_hosts[domain])
  134. # No valid MX host found for this domain
  135. logging.debug("No valid MX of domain %s found", domain)
  136. invalid_domains[domain] = "No valid MX hosts found"
  137. except DNS.ServerError, err:
  138. logging.debug('Error getting MX servers of domain %s : %s', domain, err)
  139. invalid_domains[domain] = 'DNS server error getting MX hosts : %s' % err
  140. if options.raiseonerror:
  141. raise EmailInvalidDomain(mail, domain, invalid_domains[domain])
  142. return (domain, False)
  143. def check_mx(mail):
  144. """ MX check of an email address """
  145. domain, mx_hosts = get_mail_domain_and_mx_hosts(mail)
  146. if not mx_hosts:
  147. return False
  148. if not options.verifyaddress:
  149. # We don't have to connect on MX host : just check if domain have at least on MX host
  150. return bool(mx_hosts)
  151. if domain in mx_unavailable_domain:
  152. if options.raiseonerror:
  153. raise NoMXhostAvailable(mail, mx_hosts, mx_unavailable_domain[domain])
  154. return False
  155. # Check mail on MX hosts
  156. no_mx_available = True
  157. mx_unavailable_errors = []
  158. for mx_host in mx_hosts:
  159. con = connect_to_mx(mx_host)
  160. if not con:
  161. mx_unavailable_errors[mx_host] = "%s : Fail to connect on MX host" % mx_host
  162. continue
  163. no_mx_available = False
  164. try:
  165. if verify_mail_on_mx_host(domain, con, mail, accept_on_cnx_refused=options.acceptoncnxrefused):
  166. return True
  167. except EmailRefused:
  168. if options.raiseonerror:
  169. raise
  170. return False
  171. except MXUnavailable as err:
  172. mx_unavailable_errors[mx_host] = err
  173. if no_mx_available:
  174. mx_unavailable_domain[domain] = mx_unavailable_errors
  175. if options.raiseonerror:
  176. raise NoMXhostAvailable(mail, mx_hosts, mx_unavailable_domain[domain])
  177. elif options.raiseonerror:
  178. raise EmailRefused(mail)
  179. return False
  180. valid_mx = []
  181. invalid_mx = []
  182. def connect_to_mx(mx_host):
  183. """ Connect on a MX host and return the smtplib corresponding connection object """
  184. if mx_host in invalid_mx:
  185. return False
  186. try:
  187. smtp = smtplib.SMTP(timeout=5)
  188. smtp.connect(mx_host)
  189. if options.debugsmtp:
  190. smtp.set_debuglevel(True)
  191. valid_mx.append(mx_host)
  192. return smtp
  193. except smtplib.SMTPConnectError:
  194. logging.debug("MX server %s does not respond from SMTP", mx_host)
  195. except smtplib.SMTPServerDisconnected:
  196. logging.debug("MX server %s unexpectedly closed connection", mx_host)
  197. except socket.gaierror:
  198. logging.debug("Can't resolv MX server %s", mx_host)
  199. except socket.timeout:
  200. logging.debug("Connection timeout to SMTP server %s", mx_host)
  201. except socket.error:
  202. logging.debug("Connection error on SMTP server %s", mx_host)
  203. except Exception:
  204. logging.error("Unknown error connecting to SMTP server %s", mx_host, exc_info=True)
  205. invalid_mx.append(mx_host)
  206. return None
  207. mx_refuse_check_mail = {}
  208. def verify_mail_on_mx_host(mx_host, smtp, mail, accept_on_cnx_refused=False):
  209. """ Verify an email address on a specific MX host """
  210. if mx_host in mx_refuse_check_mail:
  211. if accept_on_cnx_refused:
  212. logging.debug('%s : MX host %s refused connection but consider email as validated', mail, mx_host)
  213. return True
  214. raise MXRefuseConnection(mail, mx_host, mx_refuse_check_mail[mx_host])
  215. try:
  216. status, msg = smtp.helo()
  217. if status != 250:
  218. mx_refuse_check_mail[mx_host] = msg
  219. if accept_on_cnx_refused:
  220. logging.debug('%s : MX host %s refused connection but consider email as validated', mail, mx_host)
  221. return True
  222. raise MXRefuseConnection(mail, mx_host, msg)
  223. if options.usesmtpvrfy:
  224. (status, msg) = smtp.verify(mail)
  225. logging.debug('%s : MX host %s return the code %s on VRFY command with the following message : %s', mail, mx_host, status, msg)
  226. if status >= 250 and status < 260:
  227. # Server normaly return an normalize email address
  228. for word in msg.split(' '):
  229. if validate_email(word):
  230. return True
  231. smtp.mail('')
  232. status, msg = smtp.rcpt(mail)
  233. if status >= 400 and status < 500:
  234. logging.debug('SMTP server return temporary error (code=%s) : %s', status, msg)
  235. if options.acceptontemporaryerror:
  236. logging.debug('%s : MX host %s raise a temporary error but consider email as validated', mail, mx_host)
  237. return True
  238. raise TemporaryErrorOnMX(mail, mx_host, msg)
  239. elif status != 250:
  240. if options.raiseonerror:
  241. raise EmailRefused(mail, mx_host)
  242. return False
  243. logging.debug('%s : MX host %s accept email for this address with the following message : %s', mail, mx_host, msg)
  244. return True
  245. except smtplib.SMTPServerDisconnected:
  246. # Server not permits verify user
  247. mx_refuse_check_mail[mx_host] = "server disconnected during the exchange"
  248. if accept_on_cnx_refused:
  249. logging.debug('%s : MX host %s refused connection but consider email as validated', mail, mx_host)
  250. return True
  251. raise MXRefuseConnection(mail, mx_host, mx_refuse_check_mail[mx_host])
  252. except smtplib.SMTPConnectError:
  253. raise MXUnavailable(mail, mx_host)
  254. def mass_validate_email(mail, simple=False):
  255. """ Validate an email address with mecanisms optimized for mass email addresses validation """
  256. mail = clean_mail(mail)
  257. if not validate_email(mail):
  258. if options.raiseonerror:
  259. raise EmailInvalidSyntax(mail)
  260. return
  261. elif simple:
  262. return True
  263. elif options.checkmx:
  264. return check_mx(mail)
  265. else:
  266. return True
  267. if __name__ == '__main__':
  268. from optparse import OptionParser, OptionGroup
  269. # Default options
  270. default_output_delimiter = ";"
  271. default_output_quotechar = '"'
  272. parser = OptionParser()
  273. # options
  274. parser.add_option(
  275. '-v',
  276. '--verbose',
  277. action="store_true",
  278. dest="verbose",
  279. help='Enable verbose mode'
  280. )
  281. parser.add_option(
  282. '-d',
  283. '--debug',
  284. action="store_true",
  285. dest="debug",
  286. help='Enable debug mode'
  287. )
  288. parser.add_option(
  289. '-p',
  290. '--progress',
  291. action='store_true',
  292. dest='progress',
  293. help='Enable progress bar',
  294. default=False
  295. )
  296. parser.add_option(
  297. '-D',
  298. '--debug-smtp',
  299. action="store_true",
  300. dest="debugsmtp",
  301. help='Enabled SMTP exchange debuging'
  302. )
  303. parser.add_option(
  304. '-m',
  305. '--mx',
  306. action="store_true",
  307. dest="checkmx",
  308. help='Enable MX check'
  309. )
  310. parser.add_option(
  311. '-V',
  312. '--verify',
  313. action="store_true",
  314. dest="verifyaddress",
  315. help="Enable email address verification on MX server. If this option is enabled, MX check is also automatically enabled."
  316. )
  317. parser.add_option(
  318. '--use-smtp-vrfy',
  319. action="store_true",
  320. dest="usesmtpvrfy",
  321. help="When MX check is enabled, enable the SMPT VRFY command usage"
  322. )
  323. parser.add_option(
  324. '--accept-email-on-cnx-refused',
  325. action="store_true",
  326. dest="acceptoncnxrefused",
  327. help="When MX check is enabled, accept email address even if MX server refuse the SMTP connection (after HELO command)"
  328. )
  329. parser.add_option(
  330. '--accept-on-temporary-error',
  331. action="store_true",
  332. dest="acceptontemporaryerror",
  333. help="When MX check is enabled, accept email address even if MX server return a temporary error (after trying to send an email to the checked address)"
  334. )
  335. parser.add_option(
  336. '-f',
  337. '--from-file',
  338. action="store",
  339. type='string',
  340. dest="fromfile",
  341. help="Read emails addresses to validate from from"
  342. )
  343. output_opts = OptionGroup(parser, u"Output options")
  344. output_opts.add_option(
  345. '-o',
  346. '--output-file',
  347. action="store",
  348. type='string',
  349. dest="output_file",
  350. help="Write emails addresses validation result as a CSV file"
  351. )
  352. output_opts.add_option(
  353. '--delimiter',
  354. action='store',
  355. type='string',
  356. dest='output_delimiter',
  357. help="CSV ouput file delimiter (Default: %s)" % default_output_delimiter,
  358. default=default_output_delimiter
  359. )
  360. output_opts.add_option(
  361. '--quotechar',
  362. action='store',
  363. type='string',
  364. dest='output_quotechar',
  365. help="CSV ouput file quote character (Default: %s)" % default_output_quotechar,
  366. default=default_output_quotechar
  367. )
  368. parser.add_option_group(output_opts)
  369. (opts, emails) = parser.parse_args()
  370. # Enable and configure logging
  371. if opts.debug:
  372. logging_level = logging.DEBUG
  373. options.debug = True
  374. elif opts.verbose:
  375. logging_level = logging.INFO
  376. else:
  377. logging_level = logging.WARNING
  378. logging.basicConfig(level=logging_level, format='%(asctime)s - %(levelname)s - %(message)s')
  379. # If fromfile options if setted, load emails
  380. if opts.fromfile:
  381. logging.info('Load emails addresses from %s', opts.fromfile)
  382. with open(opts.fromfile, 'r') as fd:
  383. for line in fd.readlines():
  384. email = line.strip()
  385. if email not in emails:
  386. emails.append(email)
  387. # Check at leat one email is provided
  388. if not emails:
  389. parser.error('You must specify emails address as arguments')
  390. # If output is enabled, import csv library
  391. if opts.output_file:
  392. import csv
  393. # Configure other options from command line arguments
  394. options.raiseonerror = True
  395. options.debugsmtp = opts.debugsmtp
  396. options.checkmx = opts.checkmx or opts.verifyaddress or opts.usesmtpvrfy
  397. options.verifyaddress = opts.verifyaddress
  398. options.usesmtpvrfy = opts.usesmtpvrfy
  399. options.acceptoncnxrefused = opts.acceptoncnxrefused
  400. options.acceptontemporaryerror = opts.acceptontemporaryerror
  401. if opts.progress:
  402. from progressbar import ProgressBar, Percentage, Bar, RotatingMarker, SimpleProgress, ETA
  403. pbar = ProgressBar(
  404. widgets=[
  405. 'Validating emails addresses : ',
  406. Percentage(),
  407. ' ',
  408. Bar(marker=RotatingMarker()),
  409. ' ',
  410. SimpleProgress(),
  411. ETA()
  412. ],
  413. maxval=len(emails)
  414. ).start()
  415. pbar_count = 0
  416. else:
  417. logging.info('Start emails addresses validation')
  418. validated = []
  419. not_validated = {}
  420. for email in emails:
  421. try:
  422. if mass_validate_email(email):
  423. logging.info('Address %s is valid', email)
  424. validated.append(email)
  425. else:
  426. logging.info('Address %s is NOT valid, but no exception raised : it is not supose to happen !', email)
  427. not_validated[email] = EmailInvalid(email)
  428. except EmailInvalid as err:
  429. not_validated[email] = err
  430. if opts.progress:
  431. pbar_count += 1
  432. pbar.update(pbar_count)
  433. if opts.progress:
  434. pbar.finish()
  435. if not_validated:
  436. logging.warning('%s on %s is NOT valid :\n- %s', len(not_validated), len(emails), '\n- '.join([str(not_validated[email]) for email in not_validated]))
  437. else:
  438. logging.info('All %s emails addresses provided are valid.', len(emails))
  439. if opts.output_file:
  440. logging.info('Write emails validation result to file %s', opts.output_file)
  441. with open(opts.output_file, 'w') as fd:
  442. csv_output = csv.writer(fd, delimiter=opts.output_delimiter, quotechar=opts.output_quotechar)
  443. for email in not_validated:
  444. csv_output.writerow([email, not_validated[email].error_msg])
  445. # Adapt exit code on validation result
  446. sys.exit(1 if not_validated else 0)