Skip to content

Instantly share code, notes, and snippets.

@aslamanver
Last active February 23, 2026 11:16
Show Gist options
  • Select an option

  • Save aslamanver/5655ef2ab00c717f1695b45b946eddcb to your computer and use it in GitHub Desktop.

Select an option

Save aslamanver/5655ef2ab00c717f1695b45b946eddcb to your computer and use it in GitHub Desktop.
Extract domains and IPs from various AdBlock Plus filter formats
import re
def extract_domain_ip(line, skip_exceptional_rules=True):
    """
    Extract a domain or IPv4 address from one AdBlock Plus style filter line.

    Args:
        line (str): Line to extract domain or ip from
        skip_exceptional_rules (bool): Whether to skip exception rules
            (any line starting with "@@")

    Returns:
        tuple: ``(domain, None)`` when a domain was found, ``(None, ip)`` when
        a full four-octet IPv4 address (every octet <= 255) was found, and
        ``(None, None)`` when the line is empty, a comment, a regex rule,
        carries ``$`` modifiers, or does not yield a clean host name.
    """
    # Skip empty lines
    if not line or not line.strip():
        return (None, None)
    line = line.strip()

    # Skip comments (! or #)
    if line.startswith(("!", "#")):
        return (None, None)

    # Skip regex patterns (/.../)
    if line.startswith("/") and line.endswith("/"):
        return (None, None)

    # Skip lines with modifiers ($)
    if "$" in line:
        return (None, None)

    # Skip ALL exception rules (any line starting with @@) if flag is True
    if skip_exceptional_rules and line.startswith("@@"):
        return (None, None)

    # Ordered (prefix, also_cut_at_pipe) rules; the first matching prefix wins,
    # so longer prefixes must come before the shorter ones they contain.
    # The "@@" entries are only reachable when skip_exceptional_rules=False.
    prefix_rules = (
        ("@@||", True),
        ("@@|", True),
        ("@@-", False),
        ("||", False),
        ("|", True),
        (".", False),
        ("://", False),
        ("-", False),
    )

    domain = None
    for prefix, cut_at_pipe in prefix_rules:
        if line.startswith(prefix):
            domain = line[len(prefix):].split("^", 1)[0]
            if cut_at_pipe:
                domain = domain.split("|", 1)[0]
            domain = domain.strip()
            break
    else:
        # Plain "domain^" form; lines without any "^" are not treated as rules
        if "^" in line and not line.startswith("/"):
            domain = line.split("^", 1)[0].strip()

    if not domain:
        return (None, None)

    # Clean up wildcards (including dash-wildcard combinations)
    domain = (
        domain.replace("-*.", ".").replace("*.", "").replace("-*", "").replace("*", "")
    )
    domain = domain.lower()

    # Remove any trailing special characters
    domain = domain.rstrip(".|^")

    # Skip if empty after cleanup
    if not domain:
        return (None, None)

    # Skip if it contains spaces or URL characters: a "/" means the rule
    # targets a URL or path (e.g. "example.com/ads"), not a bare host.
    if any(ch in domain for ch in " ?&/"):
        return (None, None)

    # Skip if domain starts or ends with dash or dot (invalid domain)
    if domain.startswith(("-", ".")) or domain.endswith(("-", ".")):
        return (None, None)

    # Full 4-octet IPv4 address: accept only when every octet is in 0..255,
    # otherwise it is neither a usable IP nor a domain.
    if re.fullmatch(r"[0-9]{1,3}(?:\.[0-9]{1,3}){3}", domain):
        if all(int(octet) <= 255 for octet in domain.split(".")):
            return (None, domain)
        return (None, None)

    # Skip if domain is only digits and dots (partial IP like 158.247.208)
    if re.fullmatch(r"[\d.]+", domain):
        return (None, None)

    # Return as valid domain
    return (domain, None)
import unittest
import cf
import requests
class TestExtractDomainAndIP(unittest.TestCase):
    """Unit tests for the filter-line formats handled by cf.extract_domain_ip."""

    def _check(self, line, domain=None, ip=None, **kwargs):
        """Parse *line* and verify the resulting pair, domain first and then ip.

        A ``None`` expectation is asserted with assertIsNone, anything else
        with assertEqual. Extra keyword arguments go to extract_domain_ip.
        """
        got_domain, got_ip = cf.extract_domain_ip(line, **kwargs)
        if domain is None:
            self.assertIsNone(got_domain)
        else:
            self.assertEqual(got_domain, domain)
        if ip is None:
            self.assertIsNone(got_ip)
        else:
            self.assertEqual(got_ip, ip)

    def test_standard_block_rules(self):
        """||domain^ rules yield the bare domain."""
        self._check("||doubleclick.net^", domain="doubleclick.net")
        self._check("||example.com^", domain="example.com")

    def test_wildcard_removal(self):
        """Wildcards are stripped from the extracted domain."""
        # Leading *. is dropped entirely
        self._check("||*.example.com^", domain="example.com")
        # "*." between letters is removed together with its dot
        self._check("||putrr*.com^", domain="putrrcom")
        # A wildcard label in the middle collapses to a single dot
        self._check(
            "||reachableads-av.*.amazonaws.com^",
            domain="reachableads-av.amazonaws.com",
        )

    def test_exception_rules_skipped(self):
        """@@ exception rules produce nothing by default."""
        # Covers @@||, @@| and @@- (the originally reported issue)
        for rule in (
            "@@||ad.10010.com^",
            "@@|affiliate.notion.so^|",
            "@@-ds.metric.gstatic.com^",
        ):
            self._check(rule)

    def test_exception_rules_extracted_when_enabled(self):
        """@@ exception rules are parsed when skip_exceptional_rules=False."""
        expectations = (
            ("@@||ad.10010.com^", "ad.10010.com"),
            ("@@|affiliate.notion.so^|", "affiliate.notion.so"),
            # The "@@-" prefix is removed as a whole
            ("@@-ds.metric.gstatic.com^", "ds.metric.gstatic.com"),
        )
        for rule, expected in expectations:
            self._check(rule, domain=expected, skip_exceptional_rules=False)

    def test_single_pipe_patterns(self):
        """|domain rules are parsed, including IPs and trailing dots."""
        self._check("|piwik.", domain="piwik")
        self._check("|167.206.10.148^", ip="167.206.10.148")
        self._check("|load.gtm.", domain="load.gtm")

    def test_dot_prefix_patterns(self):
        """.domain^ rules lose their leading dot."""
        for rule, expected in (
            (".doublepimp.com^", "doublepimp.com"),
            (".tvp.tv^", "tvp.tv"),
            (".ay.delivery^", "ay.delivery"),
        ):
            self._check(rule, domain=expected)

    def test_protocol_patterns(self):
        """://domain^ rules lose their protocol separator."""
        for rule, expected in (
            ("://mine.torrent.pw^", "mine.torrent.pw"),
            ("://tru.am^", "tru.am"),
            ("://say.ac^", "say.ac"),
        ):
            self._check(rule, domain=expected)

    def test_dash_prefix_patterns(self):
        """-domain^ rules lose their leading dash."""
        # Leading dash removed, "-*." becomes "."
        self._check("-adx-*.rayjump.com^", domain="adx.rayjump.com")
        self._check("-s2s.sensic.net^", domain="s2s.sensic.net")
        self._check("-tracker.biliapi.net^", domain="tracker.biliapi.net")

    def test_plain_domain_patterns(self):
        """Plain domain^ rules are returned unchanged."""
        for name in ("vkcdnservice.appspot.com", "prd.api.bleacherreport.com"):
            self._check(name + "^", domain=name)

    def test_ip_addresses(self):
        """Full IPv4 addresses are reported in the ip slot."""
        for address in ("188.72.219.36", "193.200.65.61", "62.76.25.27"):
            self._check(address + "^", ip=address)
        self._check("|167.206.10.148^", ip="167.206.10.148")

    def test_comments_skipped(self):
        """Comment lines (! or #) are ignored."""
        self._check("! This is a comment")
        self._check("# Another comment")

    def test_regex_patterns_skipped(self):
        """Regex rules (/.../) are ignored."""
        self._check("/^139\\.45\\.197\\.2(4[0-9]|5[0-4]):/")
        self._check("/^94\\.242\\.247\\.(2[0-9]|3[0-2])/")

    def test_modifiers_skipped(self):
        """Rules carrying $ modifiers are ignored."""
        self._check("||example.com^$third-party")
        self._check("||bet.championat.com^$important")

    def test_empty_lines(self):
        """Empty and whitespace-only lines are ignored."""
        self._check("")
        self._check(" ")

    def test_special_characters_filtered(self):
        """Hosts containing special characters are rejected."""
        # A space inside the host makes it invalid
        self._check("||example .com^")

    def test_case_normalization(self):
        """Extracted domains are lowercased."""
        self._check("||ExAmPlE.COM^", domain="example.com")
        self._check("||DOUBLECLICK.NET^", domain="doubleclick.net")

    def test_case_dash(self):
        """A dash-prefixed multi-label domain is extracted."""
        self._check("-sdk.rum.aliyuncs.com^", domain="sdk.rum.aliyuncs.com")

    def test_invalid_domains_ending_with_dash(self):
        """Hosts ending in a dash are rejected; inner dashes are fine."""
        self._check("-ad123-")
        self._check("||test-^")
        # A dash in the middle of a label is valid
        self._check("||test-site.com^", domain="test-site.com")

    def test_partial_ip_addresses_rejected(self):
        """Three-octet inputs are rejected; full addresses are accepted."""
        self._check("158.247.208")
        self._check("192.168.1")
        # Full valid IP should be extracted as IP
        self._check("192.168.1.1^", ip="192.168.1.1")
def main():
    """Download each filter list and write the extracted entries to files.

    For every URL, each line is passed through cf.extract_domain_ip. Three
    files are then written to the current directory, each named with a prefix
    plus the last path segment of the URL: ``_valid_domains_``,
    ``_invalid_domains_`` (non-comment lines that produced neither a domain
    nor an IP) and ``_valid_ips_``. Entries are written sorted, one per line.

    Raises:
        requests.RequestException: if a download fails, times out, or
            returns an HTTP error status.
    """
    list_urls = [
        "https://adguardteam.github.io/AdGuardSDNSFilter/Filters/filter.txt",
        "https://raw.githubusercontent.com/hagezi/dns-blocklists/main/adblock/pro.txt",
        "https://raw.githubusercontent.com/hagezi/dns-blocklists/main/adblock/ultimate.txt",
    ]
    for url in list_urls:
        # Without a timeout requests waits indefinitely on a stalled server
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        # iter_lines(decode_unicode=True) yields bytes when no encoding is
        # known, which would break the str checks below.
        # NOTE(review): assumes the lists are UTF-8 in that case - confirm.
        if response.encoding is None:
            response.encoding = "utf-8"

        valid_domains = set()
        valid_ips = set()
        invalid_domains = set()
        for line in response.iter_lines(decode_unicode=True):
            domain, ip = cf.extract_domain_ip(line)
            if domain:
                valid_domains.add(domain)
            if ip:
                valid_ips.add(ip)
            if not domain and not ip and line and line.strip():
                # Only log non-empty lines that couldn't be parsed
                if not line.strip().startswith(("!", "#")):
                    invalid_domains.add(line)

        print(url, len(valid_domains), "domains extracted")
        print(url, len(valid_ips), "ips extracted")
        print(url, len(invalid_domains), "invalid lines")

        list_name = url.split("/")[-1]
        outputs = (
            ("_valid_domains_", valid_domains),
            ("_invalid_domains_", invalid_domains),
            ("_valid_ips_", valid_ips),
        )
        for prefix, entries in outputs:
            # Explicit UTF-8 keeps non-ASCII filter lines writable on every
            # platform; sorting makes the output reproducible between runs.
            with open(prefix + list_name, "w", encoding="utf-8") as f:
                f.writelines(entry + "\n" for entry in sorted(entries))
if __name__ == "__main__":
    # Run the tests
    # NOTE: unittest.main() exits the interpreter once the run finishes
    # (exit=True by default), so to download and process the filter lists,
    # call main() instead of unittest.main(), not after it.
    unittest.main(verbosity=2)
    # main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment