背景
本脚本基于 Python 的 cchardet 库实现二进制文件分析:按照预设的分段参数读取文件,并用 cchardet 对每个分段进行文本编码探测。脚本支持从文件的指定偏移处开始分析(-o)、按 m 字节对文件分段(-s)、跳过每个分段开头的 n 字节(-h);当某个分段的探测置信度为 0 时,还会依次再跳过 1~4 字节重新探测(从标准输入读取时为 1~3 字节)。
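结果输出的每一行依次展示:分段序号(Chunk)、分段在文件中的起始偏移(Offset)、分段内重新探测时额外跳过的字节数(offset_by)、分段大小(Size)、编码方式(Encoding)和置信度(Confidence)。脚本中预留了对高置信度(≥ 0.75)结果输出提示信息的分支,但目前其输出与普通结果相同。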
如何使用脚本:
# 1. 基本用法:分析整个文件
python encoding_detector.py myfile.bin
# 2. 指定块大小
python encoding_detector.py -s 512 myfile.bin
# 3. 跳过每个块的前 10 个字节
python encoding_detector.py -s 100 -h 10 myfile.bin
# 4. 从文件偏移 1116 开始分析
python encoding_detector.py -s 100 -o 1116 ../ftp-pcap/ftp-utf8-long.pcap
# 5. 结合使用:从偏移 1000 开始,每块 256 字节,跳过每块前 20 字节
python encoding_detector.py -s 256 -h 20 -o 1000 myfile.bin
# 6. 通过管道输入
cat myfile.bin | python encoding_detector.py -s 512
Python 脚本实现如下:
#!/usr/bin/env python3
import cchardet
import sys
import os
def print_hex(data, width=16):
    """Print *data* as a hex dump with an ASCII side column.

    Each output row shows the row's starting offset within *data* (8 hex
    digits), the bytes in two-digit lowercase hex, and the same bytes as
    ASCII, with every byte outside the printable range 32..126 shown as '.'.

    Args:
        data: Bytes-like object whose items are integers (e.g. ``bytes``).
        width: Number of bytes shown per row; must be positive.

    Raises:
        ValueError: If *width* is not positive.
    """
    if width <= 0:
        # range() already rejects a step of 0; reject negatives as well
        # instead of silently printing nothing.
        raise ValueError("width must be a positive integer")

    for start in range(0, len(data), width):
        row = data[start:start + width]
        # Hex column
        hex_part = ' '.join(f'{byte:02x}' for byte in row)
        # ASCII column (printable characters, otherwise '.')
        ascii_part = ''.join(chr(byte) if 32 <= byte <= 126 else '.' for byte in row)
        # Pad the hex column to a fixed width so the ASCII column lines up
        # on a short final row.
        print(f'{start:08x}: {hex_part:<{width*3}} |{ascii_part}|')
def _detect_once(data, chunk_number, offset_by=0):
    """Run cchardet on *data* once and normalise its answer.

    Args:
        data: Bytes handed to ``cchardet.detect``.
        chunk_number: Index of the chunk; used only in warning messages.
        offset_by: Extra leading bytes already dropped from the chunk's
            detection window (0 for the first attempt); used only in
            warning messages.

    Returns:
        A ``(encoding, confidence)`` tuple. ``encoding`` is a ``str``,
        ``None`` when the detector reported nothing, or ``"Error"`` when the
        detector raised. ``confidence`` is ``0.0`` whenever the detector
        supplied no value.
    """
    where = f"chunk {chunk_number}"
    if offset_by:
        where += f" (offset_by {offset_by})"

    try:
        result = cchardet.detect(data)
    except Exception as e:
        # Detection is best-effort: report the failure and let the caller
        # carry on with the remaining attempts / chunks.
        print(f"Warning: cchardet failed on {where}: {e}", file=sys.stderr)
        return "Error", 0.0

    if not isinstance(result, dict):
        print(f"Warning: cchardet returned unexpected type in {where}: {type(result)}", file=sys.stderr)
        return None, 0.0

    encoding = result.get('encoding')
    confidence = result.get('confidence')
    if confidence is None:
        confidence = 0.0
    if encoding is not None and not isinstance(encoding, str):
        print(f"Warning: Unexpected encoding type in {where}: {type(encoding)}", file=sys.stderr)
        # The output line formats the encoding as a string.
        encoding = str(encoding)
    return encoding, confidence


def _detect_with_retry(detection_data, chunk_number, max_offset_attempts):
    """Detect the encoding of one detection window, retrying on zero confidence.

    The window is tried as-is first. If that yields confidence 0 and the
    window is longer than *max_offset_attempts* bytes, detection is repeated
    with 1, 2, ... *max_offset_attempts* leading bytes dropped, stopping at
    the first attempt that reports a confidence above 0.

    Returns:
        ``(encoding, confidence, offset_by_used)`` where ``offset_by_used``
        is the number of extra leading bytes dropped for the accepted
        result (0 if the first attempt was kept).
    """
    encoding, confidence, offset_by_used = None, 0.0, 0

    if len(detection_data) > 0:
        encoding, confidence = _detect_once(detection_data, chunk_number)

    if confidence == 0.0 and len(detection_data) > max_offset_attempts:
        for offset_by in range(1, max_offset_attempts + 1):
            retry_encoding, retry_confidence = _detect_once(
                detection_data[offset_by:], chunk_number, offset_by)
            if retry_confidence > confidence:
                encoding = retry_encoding
                confidence = retry_confidence
                offset_by_used = offset_by
                break

    return encoding, confidence, offset_by_used


def _report_chunk(chunk_number, offset, chunk_data, from_head_bytes, max_offset_attempts):
    """Detect the encoding of one chunk and print its result line.

    The first *from_head_bytes* bytes of the chunk are excluded from
    detection; the printed size is still the size of the whole chunk.
    """
    # Slicing past the end yields an empty window, so no length check is needed.
    detection_data = chunk_data[from_head_bytes:]
    encoding, confidence, offset_by_used = _detect_with_retry(
        detection_data, chunk_number, max_offset_attempts)

    encoding_display = encoding if encoding is not None else "N/A"
    # NOTE: high-confidence results are currently printed exactly like any
    # other result; no marker is added.
    print(f"Chunk {chunk_number:4d} | Offset {offset:8d} | "
          f"offset_by {offset_by_used:2d} | "
          f"Size {len(chunk_data):4d} | "
          f"Encoding: {encoding_display:>12} | "
          f"Confidence: {confidence:6.4f}")


def detect_chunks_from_file(filename, chunk_size=1024, from_head_bytes=0, from_file_offset=0,
                            max_offset_attempts=4):
    """Split a file into fixed-size chunks and print the detected encoding of each.

    Args:
        filename: Path of the file to analyse.
        chunk_size: Number of bytes per chunk (the last chunk may be shorter).
        from_head_bytes: Leading bytes of every chunk excluded from detection.
        from_file_offset: Byte offset in the file at which reading starts.
        max_offset_attempts: When a chunk's confidence is 0, detection is
            retried with 1..max_offset_attempts further leading bytes dropped.

    One result line is printed per chunk on stdout. Errors and warnings go to
    stderr; a missing file or a read error is reported rather than raised.
    """
    if not os.path.exists(filename):
        print(f"Error: File '{filename}' does not exist.", file=sys.stderr)
        return

    try:
        file_size = os.path.getsize(filename)
        print(f"Analyzing file: {filename} (Total size: {file_size} bytes)")
        print(f"Chunk size: {chunk_size} bytes")
        if from_head_bytes > 0:
            print(f"Skipping first {from_head_bytes} bytes of each chunk for detection.")
        if from_file_offset > 0:
            print(f"Starting analysis from file offset: {from_file_offset}")
        print("-" * 50)

        with open(filename, 'rb') as f:
            # Move to the requested starting offset
            if from_file_offset > 0:
                f.seek(from_file_offset)

            # iter() with a sentinel stops at the first empty read (EOF).
            chunks = iter(lambda: f.read(chunk_size), b'')
            for chunk_number, chunk_data in enumerate(chunks):
                # Offset of this chunk in the original file
                offset = from_file_offset + chunk_number * chunk_size
                _report_chunk(chunk_number, offset, chunk_data,
                              from_head_bytes, max_offset_attempts)

            # f.tell() is an absolute position, also after the initial seek.
            absolute_tell = f.tell()
            if absolute_tell < file_size:
                print(f"Warning: Stopped reading before end of file '{filename}'. "
                      f"Read up to file offset {absolute_tell} bytes out of {file_size} bytes.", file=sys.stderr)
    except OSError as e:
        print(f"Error reading file '{filename}': {e}", file=sys.stderr)
    except Exception as e:
        print(f"An unexpected error occurred while processing '{filename}': {e}", file=sys.stderr)

    print("-" * 50 + f" Analysis of '{filename}' finished. " + "-" * 10 + "\n")


def detect_chunks_from_bytes(data, source_name="Byte Input", chunk_size=1024, from_head_bytes=0,
                             max_offset_attempts=3):
    """Split in-memory bytes into fixed-size chunks and print each chunk's detected encoding.

    Args:
        data: The bytes to analyse.
        source_name: Label used in the header and footer lines.
        chunk_size: Number of bytes per chunk (the last chunk may be shorter).
        from_head_bytes: Leading bytes of every chunk excluded from detection.
        max_offset_attempts: When a chunk's confidence is 0, detection is
            retried with 1..max_offset_attempts further leading bytes dropped.

    For empty *data* only the header and "Input data is empty." are printed.
    """
    data_len = len(data)
    print(f"Analyzing data from: {source_name} (Total size: {data_len} bytes)")
    print(f"Chunk size: {chunk_size} bytes")
    if from_head_bytes > 0:
        print(f"Skipping first {from_head_bytes} bytes of each chunk for detection.")
    print("-" * 50)

    if data_len == 0:
        print("Input data is empty.")
        return

    for chunk_number, offset in enumerate(range(0, data_len, chunk_size)):
        _report_chunk(chunk_number, offset, data[offset:offset + chunk_size],
                      from_head_bytes, max_offset_attempts)

    print("-" * 50 + f" Analysis of '{source_name}' finished. " + "-" * 10 + "\n")
def _parse_int_option(argv, index, flag, label, minimum, range_error):
    """Return the integer argument that follows ``argv[index]``.

    Prints an error to stderr and exits with status 1 when the argument is
    missing, is not an integer, or is smaller than *minimum*.
    """
    if index + 1 >= len(argv):
        print(f"Error: Option '{flag}' requires an argument.", file=sys.stderr)
        sys.exit(1)

    raw = argv[index + 1]
    try:
        value = int(raw)
        if value < minimum:
            raise ValueError(range_error)
    except ValueError as e:
        print(f"Error: Invalid {label} '{flag} {raw}': {e}", file=sys.stderr)
        sys.exit(1)
    return value


def _analyze_stdin(chunk_size, from_head_bytes, from_file_offset):
    """Read all of STDIN as binary data and analyse it chunk by chunk."""
    print("No filename provided. Reading binary data from STDIN...", file=sys.stderr)
    try:
        data = sys.stdin.buffer.read()
        if from_file_offset > 0:
            # The bytes analyser has no notion of a starting offset, so the
            # leading bytes are dropped here before analysis.
            print(f"Skipping first {from_file_offset} bytes of STDIN; "
                  "chunk offsets below are relative to that position.", file=sys.stderr)
            data = data[from_file_offset:]
        detect_chunks_from_bytes(data, source_name="STDIN",
                                 chunk_size=chunk_size, from_head_bytes=from_head_bytes)
    except KeyboardInterrupt:
        print("\nInterrupted by user.", file=sys.stderr)
    except Exception as e:
        print(f"Error reading from STDIN: {e}", file=sys.stderr)


def main():
    """Parse the command line and analyse the given files, or STDIN.

    Options (each takes one integer argument):
        -s  chunk size in bytes (positive, default 1024)
        -h  leading bytes of every chunk to exclude from detection (>= 0)
        -o  byte offset at which analysis starts (>= 0)

    Every other argument is treated as a file name. When no file name is
    given, binary data is read from STDIN with the same options applied, and
    the process exits with status 0. Invalid options exit with status 1.
    """
    # flag -> (settings key, label used in error messages, minimum, range error)
    option_specs = {
        '-s': ('chunk_size', 'chunk size', 1, "Chunk size must be positive."),
        '-h': ('from_head_bytes', 'head bytes', 0, "Head bytes to skip must be non-negative."),
        '-o': ('from_file_offset', 'file offset', 0, "File offset must be non-negative."),
    }
    settings = {'chunk_size': 1024, 'from_head_bytes': 0, 'from_file_offset': 0}
    filenames = []

    argv = sys.argv
    i = 1
    while i < len(argv):
        arg = argv[i]
        if arg in option_specs:
            key, label, minimum, range_error = option_specs[arg]
            settings[key] = _parse_int_option(argv, i, arg, label, minimum, range_error)
            i += 2
        else:
            filenames.append(arg)
            i += 1

    if not filenames:
        # Covers both a bare invocation and piped input with options,
        # e.g. `cat file | script -s 512`.
        _analyze_stdin(settings['chunk_size'], settings['from_head_bytes'],
                       settings['from_file_offset'])
        sys.exit(0)

    # Process every file with the same settings
    for filename in filenames:
        detect_chunks_from_file(filename, settings['chunk_size'],
                                settings['from_head_bytes'], settings['from_file_offset'])
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()