Posted on  Updated on 

爬虫练习-网页自动检索18禁网站

基本思路:

通过查询网("https://site.ip138.com")获取大量域名,其中可能包含需要检测的网站;爬虫伪装浏览器访问网站,获取网页源码,通过字符匹配判断是否包含18禁内容

注意,目前代码能检索的出来的,说明网站基本未做任何防爬措施,判断安全的并不一定就不是18禁网站(也许这种性质的网站希望爬虫爬到)

文件结构

代码:先运行 1_get.py,再运行 2_on.py;检索到的结果会保存到 data/sex.txt 文件中

1_get.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import requests
from bs4 import BeautifulSoup
import time

def fetch_domains(url, request_count=10, interval=5, file_path="domain_results.txt"):
    """
    Fetch the latest domain names from the given URL and append them to a file.

    :param url: target site URL to scrape
    :param request_count: number of requests to make; None means loop forever
    :param interval: seconds to sleep between requests
    :param file_path: path of the file results are appended to
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
    }
    count = 0

    while request_count is None or count < request_count:
        # A single network error must not kill the whole loop: report, wait, retry.
        # The timeout prevents the request from hanging indefinitely.
        try:
            response = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException as e:
            print(f"请求异常: {e}")
            time.sleep(interval)
            continue

        if response.status_code != 200:
            print(f"请求失败,状态码: {response.status_code}")
            time.sleep(interval)
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        # First <ul> inside the result group holds the newest domain links.
        domain_list = [item.text for item in soup.select('.result1 .group ul:nth-of-type(1) li a')]

        with open(file_path, "a") as f:
            for domain in domain_list:
                f.write(domain + "\n")

        print(f"最新查询结果已追加到 {file_path}")
        count += 1
        time.sleep(interval)

def remove_duplicates(file_path="domain_results.txt"):
    """
    Rewrite *file_path* with duplicate domain lines removed, keeping the
    first occurrence of each domain in its original order.

    :param file_path: path of the file to deduplicate
    """
    try:
        with open(file_path, "r") as src:
            # dict preserves insertion order, so this dedupes without reordering.
            seen = dict.fromkeys(entry.strip() for entry in src.readlines())
    except FileNotFoundError:
        print(f"文件 {file_path} 不存在。")
        return

    with open(file_path, "w") as dst:
        dst.writelines(entry + "\n" for entry in seen)

    print(f"重复域名已删除,更新后的数据已保存到 {file_path}")

def remove_duplicates_2(file_path="domain_results.txt"):
    """
    Deduplicate the domain file (keeping first-seen order) and additionally
    drop every domain that cannot be reached over plain HTTP.

    :param file_path: path of the file to deduplicate and filter
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) Version/14.0 "
                      "Mobile/15E148 Safari/537.36"
    }

    try:
        with open(file_path, "r") as f:
            raw_lines = f.readlines()
    except FileNotFoundError:
        print(f"文件 {file_path} 不存在。")
        return

    # Order-preserving dedup via dict keys.
    deduped = dict.fromkeys(line.strip() for line in raw_lines)
    reachable = []

    for domain in deduped:
        # Probe each domain; keep it only when it answers with HTTP 200.
        try:
            reply = requests.get(f"http://{domain}", headers=headers, timeout=10)
        except requests.RequestException:
            print(f"删除无法访问的域名: {domain}")
            continue
        if reply.status_code == 200:
            reachable.append(domain)
        else:
            print(f"删除无法访问的域名: {domain}, 状态码: {reply.status_code}")

    with open(file_path, "w") as f:
        f.writelines(entry + "\n" for entry in reachable)

    print(f"重复域名已删除,无法访问的域名已移除,更新后的数据已保存到 {file_path}")

# Example invocation: collect domains, then prune duplicates and dead hosts.
if __name__ == "__main__":
    fetch_domains(
        "https://site.ip138.com/",
        request_count=10,
        interval=5,
        file_path="data/test.txt",
    )
    remove_duplicates_2(file_path="data/test.txt")

代码,
2_on.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import requests
from bs4 import BeautifulSoup
import re

def is_adult_content(url):
    """
    Visit *url* while pretending to be a mobile browser and report whether
    the page text contains any adult-content keyword.

    :param url: target site URL
    :return: True when a keyword is found, otherwise False
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) Version/14.0 "
                      "Mobile/15E148 Safari/537.36"
    }

    # Keyword list (can be extended).
    adult_keywords = ["porn", "sex", "xxx", "adult", "erotic", "nude", "hentai", "18+", "爱情", "女神","线路","澳门","vip","撸", "麻豆","天美","精东","糖心","有码","无码"]

    try:
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as e:
        print(f"请求异常: {url}, 错误: {e}")
        return False

    if response.status_code != 200:
        print(f"访问失败: {url}, 状态码: {response.status_code}")
        return False

    # Decode using the encoding guessed from the body, then strip markup
    # and lower-case the visible text before keyword matching.
    response.encoding = response.apparent_encoding
    page_text = BeautifulSoup(response.text, "html.parser").get_text().lower()

    return any(keyword in page_text for keyword in adult_keywords)


def check_domains(file_path="domain_results.txt"):
    """
    Run the adult-content check on every domain listed in *file_path*,
    appending every flagged URL to data/sex.txt.

    :param file_path: file containing one domain per line
    """
    try:
        with open(file_path, "r") as src:
            domains = [ln.strip() for ln in src if ln.strip()]
    except FileNotFoundError:
        print(f"文件 {file_path} 不存在。")
        return

    for domain in domains:
        full_url = f"http://{domain}"
        if not is_adult_content(full_url):
            print(f"✅ 安全: {domain}")
            continue
        print(f"⚠️ 检测到成人内容: {domain}")
        with open("data/sex.txt", "a", encoding="utf-8") as out:
            out.write(full_url + "\n")

# Run the scan only when executed as a script (same guard style as 1_get.py),
# so importing this module no longer triggers network requests as a side effect.
if __name__ == "__main__":
    test_file = "data/test.txt"
    check_domains(test_file)