Posted on  Updated on 

爬虫练习-网页自动检索18禁网站

基本思路:

通过查询网("https://site.ip138.com")获取大量域名,其中可能包含需要检测的网站;爬虫伪装浏览器访问网站,获取网页源码,通过字符匹配判断是否包含18禁内容

注意,目前代码能检索的出来的,说明网站基本未做任何防爬措施,判断安全的并不一定就不是18禁网站(也许这种性质的网站希望爬虫爬到)

文件结构

代码:先运行 1_get.py,再运行 2_on.py;检索到的结果会保存到 data/sex.txt 文件中

1_get.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import requests
from bs4 import BeautifulSoup
import time

def fetch_domains(url, request_count=10, interval=5, file_path="domain_results.txt"):
    """
    Fetch the latest domain names from the given URL and append them to a file.

    :param url: target site URL to scrape
    :param request_count: number of requests to make; None means loop forever
    :param interval: seconds to sleep between requests
    :param file_path: path of the file results are appended to
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
    }
    count = 0

    while request_count is None or count < request_count:
        # A single network error must not kill the whole loop: report, wait, retry.
        # The timeout prevents the request from hanging indefinitely.
        try:
            response = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException as e:
            print(f"请求异常: {e}")
            time.sleep(interval)
            continue

        if response.status_code != 200:
            print(f"请求失败,状态码: {response.status_code}")
            time.sleep(interval)
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        # First <ul> inside the result group holds the newest domain links.
        domain_list = [item.text for item in soup.select('.result1 .group ul:nth-of-type(1) li a')]

        with open(file_path, "a") as f:
            for domain in domain_list:
                f.write(domain + "\n")

        print(f"最新查询结果已追加到 {file_path}")
        count += 1
        time.sleep(interval)

def remove_duplicates(file_path="domain_results.txt"):
    """
    Rewrite *file_path* with duplicate domain lines removed, keeping the
    first occurrence of each domain in its original order.

    :param file_path: path of the file to deduplicate
    """
    try:
        with open(file_path, "r") as src:
            # dict preserves insertion order, so this dedupes without reordering.
            seen = dict.fromkeys(entry.strip() for entry in src.readlines())
    except FileNotFoundError:
        print(f"文件 {file_path} 不存在。")
        return

    with open(file_path, "w") as dst:
        dst.writelines(entry + "\n" for entry in seen)

    print(f"重复域名已删除,更新后的数据已保存到 {file_path}")

def remove_duplicates_2(file_path="domain_results.txt"):
    """
    Deduplicate the domain file (keeping first-seen order) and additionally
    drop every domain that cannot be reached over plain HTTP.

    :param file_path: path of the file to deduplicate and filter
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) Version/14.0 "
                      "Mobile/15E148 Safari/537.36"
    }

    try:
        with open(file_path, "r") as f:
            raw_lines = f.readlines()
    except FileNotFoundError:
        print(f"文件 {file_path} 不存在。")
        return

    # Order-preserving dedup via dict keys.
    deduped = dict.fromkeys(line.strip() for line in raw_lines)
    reachable = []

    for domain in deduped:
        # Probe each domain; keep it only when it answers with HTTP 200.
        try:
            reply = requests.get(f"http://{domain}", headers=headers, timeout=10)
        except requests.RequestException:
            print(f"删除无法访问的域名: {domain}")
            continue
        if reply.status_code == 200:
            reachable.append(domain)
        else:
            print(f"删除无法访问的域名: {domain}, 状态码: {reply.status_code}")

    with open(file_path, "w") as f:
        f.writelines(entry + "\n" for entry in reachable)

    print(f"重复域名已删除,无法访问的域名已移除,更新后的数据已保存到 {file_path}")

# Example invocation: collect domains, then prune duplicates and dead hosts.
if __name__ == "__main__":
    fetch_domains(
        "https://site.ip138.com/",
        request_count=10,
        interval=5,
        file_path="data/test.txt",
    )
    remove_duplicates_2(file_path="data/test.txt")

代码,
2_on.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import requests
from bs4 import BeautifulSoup
import re

def is_adult_content(url):
    """
    Visit *url* while pretending to be a mobile browser and report whether
    the page text contains any adult-content keyword.

    :param url: target site URL
    :return: True when a keyword is found, otherwise False
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) Version/14.0 "
                      "Mobile/15E148 Safari/537.36"
    }

    # Keyword list (can be extended).
    adult_keywords = ["porn", "sex", "xxx", "adult", "erotic", "nude", "hentai", "18+", "爱情", "女神","线路","澳门","vip","撸", "麻豆","天美","精东","糖心","有码","无码"]

    try:
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as e:
        print(f"请求异常: {url}, 错误: {e}")
        return False

    if response.status_code != 200:
        print(f"访问失败: {url}, 状态码: {response.status_code}")
        return False

    # Decode using the encoding guessed from the body, then strip markup
    # and lower-case the visible text before keyword matching.
    response.encoding = response.apparent_encoding
    page_text = BeautifulSoup(response.text, "html.parser").get_text().lower()

    return any(keyword in page_text for keyword in adult_keywords)


def check_domains(file_path="domain_results.txt"):
    """
    Run the adult-content check on every domain listed in *file_path*,
    appending every flagged URL to data/sex.txt.

    :param file_path: file containing one domain per line
    """
    try:
        with open(file_path, "r") as src:
            domains = [ln.strip() for ln in src if ln.strip()]
    except FileNotFoundError:
        print(f"文件 {file_path} 不存在。")
        return

    for domain in domains:
        full_url = f"http://{domain}"
        if not is_adult_content(full_url):
            print(f"✅ 安全: {domain}")
            continue
        print(f"⚠️ 检测到成人内容: {domain}")
        with open("data/sex.txt", "a", encoding="utf-8") as out:
            out.write(full_url + "\n")

# Run the scan only when executed as a script (same guard style as 1_get.py),
# so importing this module no longer triggers network requests as a side effect.
if __name__ == "__main__":
    test_file = "data/test.txt"
    check_domains(test_file)