Batch URL liveness scan plus page titles, Python version
Batch-checks whether URLs are alive and writes the results to txt files. This version is faster than the earlier PHP one and another Python one by who knows how many orders of magnitude; those two still have not produced any results.
This one gets through several thousand URLs and returns results within a few minutes, which solved the problem in one go.
This time I needed to check the liveness of a few thousand GPT sites and pick out the higher-quality ones among those still alive.
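The input is a plain text file with one URL per line; the scheme is optional, since the script prepends http:// when it is missing. A hypothetical chatgpturls.txt (placeholder entries, not the real list) might look like:
https://chat.example.com
http://gpt.example.org
free-gpt.example.net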
import asyncio, aiohttp, time, threading, argparse, os
from colorama import init, Fore

init(autoreset=True)

headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Mobile Safari/537.36'
}
version = "1.1"

# Dependencies: aiohttp argparse requests colorama
# If they fail to install and upgrading pip does not help, install them one by one:
#   pip install aiohttp
#   pip install aiodns
#   pip install colorama
#   pip install argparse
# https://docs.aiohttp.org/en/stable/
# Re-tested the batch liveness scan:
#   python D:\ENdir\p\python\CheckAlive.py -f C:\Users\c\chatgpturls.txt -x 301,302,403,404
#   python D:\ENdir\p\python\CheckAlive.py -x 404 -f C:\Users\c\chatgpturls.txt
# The files this script exports are saved in the current user's home directory, C:\Users\c

def logo():
    logo0 = r'''
 ______ _____________ ____
/ ___// ___/\_ __ \_/ ___\
\___ \ \___ \ | | \/\ \___
/____ >____ > |__| \___ >
\/ \/ \/
            Version {}
            By 山山而川
'''
    colored_logo = logo0.format(version)
    colored_logo = colored_logo.replace("____", Fore.YELLOW + "____" + Fore.RESET)
    print(colored_logo)
def usage():
    print('''
    Usage:
        Batch web liveness check: python CheckAlive.py -f url.txt
    Options:
        -f --file     file containing the URLs
        -x --xcode    status codes to exclude; the defaults counted as alive are 200, 301, 302, 403, 404
        -a --addcode  status codes to add
        -p --proxy    127.0.0.1:7890   use a proxy ''')

def get_parser():
    parser = argparse.ArgumentParser(usage='python CheckAlive.py -f url.txt',
                                     description='Python 3 based web liveness check script'
                                     )
    p = parser.add_argument_group('options')
    p.add_argument("-f", "--file", type=str, help="txt file containing the URLs")
    p.add_argument("-x", "--xcode", type=str, help="status codes to exclude, e.g. -x 404 also treats 404 responses as dead; separate multiple values with commas")
    p.add_argument("-a", "--addcode", type=str, help="status codes to add, e.g. -a 500 also treats 500 responses as alive; separate multiple values with commas")
    p.add_argument("-p", "--proxy", type=str, help="proxy, e.g. -p 127.0.0.1:7890")
    args = parser.parse_args()
    if args.xcode:
        args.xcode = args.xcode.split(',')
    if args.addcode:
        args.addcode = args.addcode.split(',')
    return args
def check_urls(file, proxy, xcode, addcode):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Mobile Safari/537.36'
    }
    codeList = [200, 301, 302, 403, 404]
    if xcode:  # drop excluded status codes
        for c in xcode:
            if int(c) in codeList:
                codeList.remove(int(c))
    if addcode:
        for d in addcode:
            codeList.append(int(d))
    if proxy:
        proxy = "http://" + proxy
    else:
        proxy = None
    if os.path.exists("alive.txt"):
        os.remove("alive.txt")
    if os.path.exists("die.txt"):
        os.remove("die.txt")
    try:
        urls = [url.strip() if url.startswith("http://") or url.startswith("https://") else "http://" + url.strip() for url in open(file, "r", encoding="utf-8")]
    except Exception as e:
        print("Failed to open %s: %s" % (file, e))
        return
    print(Fore.YELLOW + "[INFO] %s sites in total" % len(urls))
    lock = threading.Lock()  # mutex guarding the result files
    async def check_url(url, sem):
        async with sem:
            retry_count = 2  # retry once; counting the initial request, two attempts in total
            while retry_count > 0:
                retry_count -= 1
                try:
                    async with aiohttp.ClientSession() as session:
                        async with session.get(url, headers=headers, ssl=False, timeout=18, allow_redirects=False, proxy=proxy) as response:
                            code = response.status
                            if code in codeList:
                                print(Fore.GREEN + "[+] status: %s %s alive" % (code, url))
                                with lock:
                                    with open('alive.txt', 'a', encoding='utf-8') as f:
                                        f.write(url + '\n')
                            else:
                                print(Fore.RED + "[-] status: %s %s dead" % (code, url))
                                with lock:
                                    with open('die.txt', 'a', encoding='utf-8') as f:
                                        f.write(url + '\n')
                            return
                except (asyncio.TimeoutError, aiohttp.ClientConnectorError) as e:
                    if retry_count == 0:
                        print(Fore.RED + "[-] %s %s" % (url, repr(e)))
                        with lock:
                            with open('die.txt', 'a', encoding='utf-8') as f:
                                f.write(url + '\n')
                    else:
                        await asyncio.sleep(0.8)  # wait 0.8 s before retrying
                except Exception as e:
                    print(Fore.RED + "[-] request error: %s %s" % (url, repr(e)))
                    with lock:
                        with open('die.txt', 'a', encoding='utf-8') as f:
                            f.write(url + '\n')
                    return
    async def main():
        sem = asyncio.Semaphore(100)  # cap concurrency at 100 requests
        tasks = []
        for url in urls:
            tasks.append(check_url(url, sem))
        await asyncio.gather(*tasks)
    start_time = time.time()
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(Fore.GREEN + "Done in %s seconds. Live URLs saved to alive.txt, dead URLs to die.txt" % elapsed_time)
def main():
    logo()
    args = get_parser()
    if args.file:
        # print(args.file)
        check_urls(args.file, args.proxy, args.xcode, args.addcode)
    else:
        usage()  # no arguments given, so print the usage text

if __name__ == '__main__':
    main()
Usage. Status codes can be excluded (see the code for details), extra status codes can be added, and a proxy can be used; the snippet after the examples shows how -x and -a reshape the default status-code list:
python D:\ENdir\p\python\CheckAlive.py -f C:\Users\c\chatgpturls.txt -x 301,302,403,404
python D:\ENdir\p\python\CheckAlive.py -x 404 -f C:\Users\c\chatgpturls.txt
alive.txt is the list of live URLs; like die.txt, it is written to the directory the script is run from, here C:\Users\c.
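To see exactly which status codes a given -x / -a combination ends up accepting, the short sketch below repeats the same list arithmetic as check_urls; the flag values here are only an illustration, not part of the original run.
default_codes = [200, 301, 302, 403, 404]
xcode = "301,302,403,404".split(",")   # what -x 301,302,403,404 passes in
addcode = "500".split(",")             # what -a 500 would pass in
codes = [c for c in default_codes if str(c) not in xcode] + [int(d) for d in addcode]
print(codes)   # -> [200, 500]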
Next, extract the titles of the live URLs separately.
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

def get_title(url):
    try:
        # a timeout keeps a single dead host from hanging the whole run
        response = requests.get(url, timeout=15)
        response.encoding = response.apparent_encoding
        # use response.text so the apparent_encoding set above actually takes effect
        soup = BeautifulSoup(response.text, 'html.parser')
        title = soup.title.string.strip()
        return url, title
    except Exception as e:
        return url, str(e)
def process_urls(filename):
    urls = []
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            url = line.strip()
            urls.append(url)
    results = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []
        for url in urls:
            future = executor.submit(get_title, url)
            futures.append(future)
        for future in futures:
            result = future.result()
            results.append(result)
            print_result(result)
    return results

def print_result(result):
    url, title = result
    print(f"URL: {url}\nTitle: {title}\n")
    with open('output.txt', 'a', encoding='utf-8') as file:
        file.write(f"URL: {url}\nTitle: {title}\n\n")

filename = r'C:\Users\c\chatgpturls.txt'
results = process_urls(filename)
output.txt is the output file; it is generated in the directory the script is run from, here C:\Users\c.
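Since the goal was to keep only the higher-quality live sites, a small follow-up filter on output.txt can help; pointing filename at alive.txt first also keeps dead sites out entirely. The sketch below is my own addition rather than part of the original scripts: it assumes the URL:/Title: blocks written by print_result and uses a rough heuristic to drop entries whose "title" is really an exception message from get_title.
with open('output.txt', 'r', encoding='utf-8') as f:
    blocks = [b for b in f.read().split('\n\n') if b.strip()]
for block in blocks:
    url_line, title_line = block.split('\n', 1)
    url = url_line.replace('URL: ', '', 1)
    title = title_line.replace('Title: ', '', 1).strip()
    # get_title() returns str(e) on failure; those messages usually mention
    # "Max retries", "Connection" or "timed out" (rough heuristic, adjust as needed)
    if title and not any(s in title for s in ('Max retries', 'Connection', 'timed out')):
        print(f'{url}\t{title}')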