1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101
| import re import gzip import os from collections import defaultdict from datetime import datetime
# --- Script configuration ---------------------------------------------------

# Log file name; not referenced by the code visible in this file.
LOG_FILE = 'easygif.cn_2025_07_07_000000_010000'

# Minimum number of requests an IP must have made to be reported.
# With 0, every parsed client IP is reported.
THRESHOLD = 0

# When true, parse_log() also collects the IPs that received a 4xx/5xx status.
CHECK_4XX_5XX = True

# Report file written by main(), one "<ip> - <count>" line per IP.
OUTPUT_FILE = 'abnormal_ips.txt'
# One access-log line. The eight capture groups are, in order: timestamp,
# client IP, website, request line, status, user agent, content type and
# backend IP (the names parse_log() unpacks them into).
log_pattern = re.compile(
    r'^\[(.*?)\]\s+'                 # group 1: bracketed timestamp
    r'(\S+)\s+'                      # group 2: client IP
    r'-\s+'                          # literal "-" placeholder
    r'\d+\s+'                        # uncaptured numeric field
    r'"(.*?)"\s+'                    # group 3: quoted website
    r'"(.*?)"\s+'                    # group 4: quoted request line
    r'(\d{3})\s+\d+\s+\S+\s+'        # group 5: 3-digit status, then two uncaptured fields
    r'\S+\s+'                        # uncaptured field
    r'"(.*?)"\s+'                    # group 6: quoted user agent
    r'"(.*?)"\s+'                    # group 7: quoted content type
    r'(\S+)'                         # group 8: backend IP
)
def extract_gz_files(directory):
    """Decompress every ``.gz`` file located directly inside *directory*.

    Each archive ``<name>.gz`` is written next to itself as ``<name>``,
    overwriting any existing file of that name. The archive itself is left
    in place. Sub-directories are not searched.

    Args:
        directory: Path of the directory to scan.

    Returns:
        list[str]: Paths of the extracted files, ordered by archive name.

    Raises:
        OSError: If the directory cannot be listed, or an archive cannot be
            read or its output cannot be written.
    """
    import shutil  # local import so this block does not depend on file-level edits

    extracted_files = []
    # Sort so the result order does not depend on the filesystem's listing order.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.gz'):
            continue
        gz_path = os.path.join(directory, filename)
        if not os.path.isfile(gz_path):
            # A directory that merely ends in ".gz" is not an archive.
            continue
        extracted_path = gz_path[:-3]
        # Copy bytes in chunks: avoids loading a whole log into memory and
        # keeps the content byte-exact even when it is not valid UTF-8.
        with gzip.open(gz_path, 'rb') as gz_file, \
                open(extracted_path, 'wb') as extracted_file:
            shutil.copyfileobj(gz_file, extracted_file)
        extracted_files.append(extracted_path)
    return extracted_files
def parse_log(file_paths):
    """Parse access-log files and collect request times per client IP.

    A line is skipped when it does not match ``log_pattern``, when its
    client IP is ``-`` or starts with ``10.`` / ``192.168.``, or when its
    timestamp cannot be parsed.

    Args:
        file_paths: Iterable of paths to text log files.

    Returns:
        tuple: ``(ip_requests, error_ips)``. ``ip_requests`` maps each
        client IP to a list of integer Unix timestamps, one per request, in
        the order the lines were read. ``error_ips`` is the set of client
        IPs with at least one status starting with ``4`` or ``5``; it stays
        empty when ``CHECK_4XX_5XX`` is false.

    Raises:
        OSError: If a log file cannot be opened or read.
    """
    ip_requests = defaultdict(list)
    error_ips = set()

    for file_path in file_paths:
        # errors='replace': a stray undecodable byte in one request must not
        # abort the whole analysis.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            for line in f:
                match = log_pattern.match(line.strip())
                if not match:
                    continue

                # Only the timestamp, client IP and status are used here.
                timestamp_str, client_ip = match.group(1, 2)
                status = match.group(5)

                # Cheap filter first, before the comparatively costly strptime.
                if client_ip == '-' or client_ip.startswith(('10.', '192.168.')):
                    continue

                try:
                    timestamp = datetime.strptime(
                        timestamp_str, "%d/%b/%Y:%H:%M:%S %z"
                    )
                except ValueError:
                    # Malformed timestamp: treat like any other unparseable line.
                    continue

                ip_requests[client_ip].append(int(timestamp.timestamp()))

                if CHECK_4XX_5XX and status.startswith(('4', '5')):
                    error_ips.add(client_ip)

    return ip_requests, error_ips
def detect_high_frequency(ip_requests):
    """Return the high-frequency client IPs with their request counts.

    Args:
        ip_requests: Mapping of client IP to its list of request timestamps.

    Returns:
        dict: IP -> request count for every IP with at least ``THRESHOLD``
        requests, ordered from most to fewest requests. IPs with equal
        counts keep the order they had in *ip_requests*.
    """
    ranked = []
    for ip, timestamps in ip_requests.items():
        hits = len(timestamps)
        if hits >= THRESHOLD:
            ranked.append((ip, hits))

    # list.sort is stable, so ties retain their original relative order.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return dict(ranked)
def main():
    """Extract the logs in the current directory, rank IPs and save a report.

    Decompresses every ``.gz`` file in the working directory, parses the
    extracted logs, then writes one ``<ip> - <count>`` line per reported IP
    to ``OUTPUT_FILE`` while echoing each IP to stdout.
    """
    log_files = extract_gz_files(os.getcwd())

    # NOTE(review): the error-IP set returned by parse_log() is not used in
    # the report -- confirm whether that is intended.
    ip_requests, _error_ips = parse_log(log_files)

    abnormal_ips = dict(detect_high_frequency(ip_requests))

    with open(OUTPUT_FILE, 'w', encoding='utf-8') as report:
        for ip, count in abnormal_ips.items():
            report.write(f"{ip} - {count}\n")
            print(f"Abnormal IP detected: {ip}")

    print(f"\n✅ Total abnormal IPs found: {len(abnormal_ips)}")
    print(f"Saved to: {OUTPUT_FILE}")


if __name__ == '__main__':
    main()
|