Last active
February 23, 2026 11:16
-
-
Save aslamanver/5655ef2ab00c717f1695b45b946eddcb to your computer and use it in GitHub Desktop.
Extract domains and IPs from various AdBlock Plus filter formats
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import re | |
def extract_domain_ip(line, skip_exceptional_rules=True):
    """
    Extract a domain or IPv4 address from one AdBlock Plus style filter line.

    Recognised rule shapes are ``||host^``, ``|host``, ``.host^``,
    ``://host^``, ``-host^`` and ``host^``, plus the exception forms
    ``@@||``, ``@@|`` and ``@@-`` when exception rules are not skipped.
    Wildcards are removed and the result is lower-cased.

    Args:
        line (str): Line to extract domain or ip from
        skip_exceptional_rules (bool): Whether to skip exceptional rules
            (every line starting with ``@@``)

    Returns:
        tuple: ``(domain, None)`` for a domain, ``(None, ip)`` for a full
        4-octet IPv4 address whose octets are all in 0-255, and
        ``(None, None)`` when the line is empty, a comment, a regex rule,
        carries ``$`` modifiers, or does not yield a clean host.
    """
    nothing = (None, None)

    # Skip empty lines
    if not line or not line.strip():
        return nothing
    line = line.strip()

    # Skip comments (! or #)
    if line.startswith(("!", "#")):
        return nothing

    # Skip regex patterns
    if line.startswith("/") and line.endswith("/"):
        return nothing

    # Skip lines with modifiers ($)
    if "$" in line:
        return nothing

    # Skip ALL exception rules (any line starting with @@) if flag is True
    if skip_exceptional_rules and line.startswith("@@"):
        return nothing

    domain = None
    # Handle exception rules (@@||, @@|, @@-) - only reached if skip_exceptional_rules=False
    if line.startswith("@@||"):
        domain = line[4:].split("^", 1)[0].split("|", 1)[0].strip()
    elif line.startswith("@@|"):
        domain = line[3:].split("^", 1)[0].split("|", 1)[0].strip()
    elif line.startswith("@@-"):
        domain = line[3:].split("^", 1)[0].strip()
    # Handle standard block rules (||)
    elif line.startswith("||"):
        domain = line[2:].split("^", 1)[0].strip()
    # Handle single pipe (|)
    elif line.startswith("|"):
        domain = line[1:].split("^", 1)[0].split("|", 1)[0].strip()
    # Handle dot prefix (.)
    elif line.startswith("."):
        domain = line[1:].split("^", 1)[0].strip()
    # Handle protocol (://)
    elif line.startswith("://"):
        domain = line[3:].split("^", 1)[0].strip()
    # Handle dash prefix (-) - extract domain after dash
    elif line.startswith("-"):
        domain = line[1:].split("^", 1)[0].strip()
    # Handle plain domain (must carry a ^ separator to be recognised)
    elif "^" in line and not line.startswith("/"):
        domain = line.split("^", 1)[0].strip()

    if not domain:
        return nothing

    # Clean up wildcards (including dash-wildcard combinations)
    domain = (
        domain.replace("-*.", ".").replace("*.", "").replace("-*", "").replace("*", "")
    )
    domain = domain.lower()

    # Remove any trailing special characters
    domain = domain.rstrip(".|^")

    # Skip if empty after cleanup
    if not domain:
        return nothing

    # Skip anything that still looks like a URL fragment rather than a bare
    # host: spaces, query characters, path separators, scheme/port colons,
    # or a leftover "@" from an exception prefix that no branch stripped.
    if any(char in domain for char in " ?&/:@"):
        return nothing

    # Skip if domain starts or ends with dash or dot (invalid domain)
    if domain.startswith(("-", ".")) or domain.endswith(("-", ".")):
        return nothing

    # Check for valid IP addresses first (full 4-octet IPs)
    if re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", domain):
        # The pattern alone would accept e.g. 999.300.1.1, so range-check
        # every octet before reporting the value as an address.
        if all(int(octet) <= 255 for octet in domain.split(".")):
            return (None, domain)
        return nothing

    # Skip if domain is only digits and dots (partial IP or invalid pattern like 158.247.208)
    if re.match(r"^[\d.]+$", domain):
        return nothing

    # Return as valid domain
    return (domain, None)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import unittest | |
| import cf | |
| import requests | |
class TestExtractDomainAndIP(unittest.TestCase):
    """Test cases for extract_domain_ip function"""

    def _check(self, line, domain=None, ip=None, **kwargs):
        """
        Run cf.extract_domain_ip on ``line`` and verify both result slots.

        Args:
            line (str): Filter line handed to the extractor.
            domain (str | None): Expected domain; None means "must be None".
            ip (str | None): Expected IP; None means "must be None".
            **kwargs: Forwarded to cf.extract_domain_ip
                (e.g. skip_exceptional_rules=False).
        """
        got_domain, got_ip = cf.extract_domain_ip(line, **kwargs)
        for label, expected, actual in (
            ("domain", domain, got_domain),
            ("ip", ip, got_ip),
        ):
            message = "{} extracted from {!r}".format(label, line)
            if expected is None:
                self.assertIsNone(actual, message)
            else:
                self.assertEqual(actual, expected, message)

    def test_standard_block_rules(self):
        """Test ||domain^ format"""
        self._check("||doubleclick.net^", domain="doubleclick.net")
        self._check("||example.com^", domain="example.com")

    def test_wildcard_removal(self):
        """Test wildcard pattern removal"""
        # Test *.domain.com
        self._check("||*.example.com^", domain="example.com")
        # Test domain*.com (wildcard between letters gets removed without dot)
        self._check("||putrr*.com^", domain="putrrcom")  # * is removed, leaving putrrcom
        # Test reachableads-av.*.amazonaws.com
        self._check(
            "||reachableads-av.*.amazonaws.com^",
            domain="reachableads-av.amazonaws.com",
        )

    def test_exception_rules_skipped(self):
        """Test @@ exception patterns are skipped"""
        # @@|| pattern should be skipped
        self._check("@@||ad.10010.com^")
        # @@| pattern should be skipped
        self._check("@@|affiliate.notion.so^|")
        # @@- pattern should be skipped (originally reported issue)
        self._check("@@-ds.metric.gstatic.com^")

    def test_exception_rules_extracted_when_enabled(self):
        """Test @@ exception patterns are extracted when skip_exceptional_rules=False"""
        # @@|| pattern should extract domain
        self._check(
            "@@||ad.10010.com^",
            domain="ad.10010.com",
            skip_exceptional_rules=False,
        )
        # @@| pattern should extract domain
        self._check(
            "@@|affiliate.notion.so^|",
            domain="affiliate.notion.so",
            skip_exceptional_rules=False,
        )
        # @@- pattern should extract domain (after removing @@-)
        self._check(
            "@@-ds.metric.gstatic.com^",
            domain="ds.metric.gstatic.com",
            skip_exceptional_rules=False,
        )

    def test_single_pipe_patterns(self):
        """Test |domain patterns"""
        self._check("|piwik.", domain="piwik")
        self._check("|167.206.10.148^", ip="167.206.10.148")
        self._check("|load.gtm.", domain="load.gtm")

    def test_dot_prefix_patterns(self):
        """Test .domain^ patterns"""
        self._check(".doublepimp.com^", domain="doublepimp.com")
        self._check(".tvp.tv^", domain="tvp.tv")
        self._check(".ay.delivery^", domain="ay.delivery")

    def test_protocol_patterns(self):
        """Test ://domain^ patterns"""
        self._check("://mine.torrent.pw^", domain="mine.torrent.pw")
        self._check("://tru.am^", domain="tru.am")
        self._check("://say.ac^", domain="say.ac")

    def test_dash_prefix_patterns(self):
        """Test -domain^ patterns"""
        # Leading dash removed, -* becomes .
        self._check("-adx-*.rayjump.com^", domain="adx.rayjump.com")
        self._check("-s2s.sensic.net^", domain="s2s.sensic.net")
        self._check("-tracker.biliapi.net^", domain="tracker.biliapi.net")

    def test_plain_domain_patterns(self):
        """Test plain domain^ patterns"""
        self._check("vkcdnservice.appspot.com^", domain="vkcdnservice.appspot.com")
        self._check("prd.api.bleacherreport.com^", domain="prd.api.bleacherreport.com")

    def test_ip_addresses(self):
        """Test IP address extraction"""
        self._check("188.72.219.36^", ip="188.72.219.36")
        self._check("193.200.65.61^", ip="193.200.65.61")
        self._check("62.76.25.27^", ip="62.76.25.27")
        self._check("|167.206.10.148^", ip="167.206.10.148")

    def test_comments_skipped(self):
        """Test that comments are skipped"""
        self._check("! This is a comment")
        self._check("# Another comment")

    def test_regex_patterns_skipped(self):
        """Test that regex patterns are skipped"""
        self._check("/^139\\.45\\.197\\.2(4[0-9]|5[0-4]):/")
        self._check("/^94\\.242\\.247\\.(2[0-9]|3[0-2])/")

    def test_modifiers_skipped(self):
        """Test that lines with $ modifiers are skipped"""
        self._check("||example.com^$third-party")
        self._check("||bet.championat.com^$important")

    def test_empty_lines(self):
        """Test that empty lines return None"""
        self._check("")
        self._check(" ")

    def test_special_characters_filtered(self):
        """Test that domains with special characters are filtered"""
        # Domains with spaces should be rejected
        self._check("||example .com^")

    def test_case_normalization(self):
        """Test that domains are lowercase"""
        self._check("||ExAmPlE.COM^", domain="example.com")
        self._check("||DOUBLECLICK.NET^", domain="doubleclick.net")

    def test_case_dash(self):
        """Test that a leading dash is stripped from the domain"""
        self._check("-sdk.rum.aliyuncs.com^", domain="sdk.rum.aliyuncs.com")

    def test_invalid_domains_ending_with_dash(self):
        """Test that domains ending with dashes are rejected"""
        # Domain ending with dash should be invalid
        self._check("-ad123-")
        self._check("||test-^")
        # Valid domain with dash in middle should work
        self._check("||test-site.com^", domain="test-site.com")

    def test_partial_ip_addresses_rejected(self):
        """Test that partial IP addresses are rejected"""
        # Bare partial IPs have no recognised prefix or ^ separator, so they
        # are dropped before any IP validation happens.
        self._check("158.247.208")
        self._check("192.168.1")
        # With the ^ separator the value is actually parsed, which is what
        # exercises the digits-and-dots rejection.
        self._check("158.247.208^")
        self._check("192.168.1^")
        # Full valid IP should be extracted as IP
        self._check("192.168.1.1^", ip="192.168.1.1")
def main():
    """
    Download the configured filter lists and write extraction reports.

    For every list URL the lines are passed through cf.extract_domain_ip and
    three files are written to the current working directory, each named
    with a prefix plus the last path segment of the URL:
    ``_valid_domains_<name>``, ``_invalid_domains_<name>`` and
    ``_valid_ips_<name>``. Entries are written sorted, one per line, so
    repeated runs produce comparable files.

    Raises:
        requests.HTTPError: If a list responds with an error status.
        requests.Timeout: If a server does not answer within the timeout.
    """
    list_urls = [
        "https://adguardteam.github.io/AdGuardSDNSFilter/Filters/filter.txt",
        "https://raw.githubusercontent.com/hagezi/dns-blocklists/main/adblock/pro.txt",
        "https://raw.githubusercontent.com/hagezi/dns-blocklists/main/adblock/ultimate.txt",
    ]
    for url in list_urls:
        # Without a timeout requests waits indefinitely on a stalled server.
        # (connect timeout, read timeout) in seconds.
        response = requests.get(url, timeout=(10, 60))
        response.raise_for_status()
        # iter_lines(decode_unicode=True) only decodes when an encoding is
        # known; with no charset in the response it yields bytes, which the
        # str-based extractor cannot handle.
        if response.encoding is None:
            response.encoding = "utf-8"

        valid_domains = set()
        valid_ips = set()
        invalid_domains = set()
        for line in response.iter_lines(decode_unicode=True):
            domain, ip = cf.extract_domain_ip(line)
            if domain:
                valid_domains.add(domain)
            if ip:
                valid_ips.add(ip)
            if not domain and not ip and line and line.strip():
                # Only log non-empty lines that couldn't be parsed
                if not line.strip().startswith(("!", "#")):
                    invalid_domains.add(line)

        print(url, len(valid_domains), "domains extracted")
        print(url, len(valid_ips), "ips extracted")
        print(url, len(invalid_domains), "invalid lines")

        list_name = url.split("/")[-1]
        reports = (
            ("_valid_domains_", valid_domains),
            ("_invalid_domains_", invalid_domains),
            ("_valid_ips_", valid_ips),
        )
        for prefix, entries in reports:
            # Explicit encoding keeps the output independent of the
            # platform's default locale.
            with open(prefix + list_name, "w", encoding="utf-8") as f:
                f.writelines(entry + "\n" for entry in sorted(entries))
# Script entry point: executing this file directly runs the unit tests.
if __name__ == "__main__":
    # Run the tests with one result line per test (verbosity=2).
    unittest.main(verbosity=2)
    # To generate the list report files instead, call main() in place of the
    # line above; unittest.main() exits the interpreter by default, so a call
    # placed after it would not run.
    # main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment