Python二进制文件编码探测工具

发表于2025-08-06由麦芽爸

背景
实现基于python语言cchardet库的二进制文件分析程序，按照预设分段参数对文件进行读取和cchardet的文本编码探测。脚本具备跳过文件头n字节，按照m字节分段二进制文件及分段后数据连续4字节探测功能。
结果输出会展示每段的序号，偏移起始，片内置信度识别偏移字节，片大小，编码方式，置信度，高置信度提示信息字段；

如何使用脚本：

# 1. 基本用法：分析整个文件
python encoding_detector.py myfile.bin

# 2. 指定块大小
python encoding_detector.py -s 512 myfile.bin

# 3. 跳过每个块的前 10 个字节
python encoding_detector.py -s 100 -h 10 myfile.bin

# 4. 从文件偏移 1116 开始分析
python encoding_detector.py -s 100 -o 1116 ../ftp-pcap/ftp-utf8-long.pcap

# 5. 结合使用：从偏移 1000 开始，每块 256 字节，跳过每块前 20 字节
python encoding_detector.py -s 256 -h 20 -o 1000 myfile.bin

# 6. 通过管道输入
cat myfile.bin | python encoding_detector.py -s 512

Python脚本是实现

#!/usr/bin/env python3
import cchardet
import sys
import os

def print_hex(data, width=16):
    """以十六进制和ASCII形式打印字节数据"""
    for i in range(0, len(data), width):
        # 十六进制部分
        hex_part = ' '.join(f'{byte:02x}' for byte in data[i:i+width])
        # ASCII部分 (可打印字符或'.')
        ascii_part = ''.join(chr(byte) if 32 <= byte <= 126 else '.' for byte in data[i:i+width])
        # 打印地址偏移、十六进制和ASCII
        print(f'{i:08x}: {hex_part:<{width*3}} |{ascii_part}|')

def detect_chunks_from_file(filename, chunk_size=1024, from_head_bytes=0, from_file_offset=0):
    """
    将文件按指定大小切块，并对每个块进行编码检测。
    如果检测置信度为0，则尝试偏移1-4字节重新检测。
    from_file_offset: 从文件的哪个字节偏移开始读取。
    """
    if not os.path.exists(filename):
        print(f"Error: File '{filename}' does not exist.", file=sys.stderr)
        return

    try:
        file_size = os.path.getsize(filename)
        print(f"Analyzing file: {filename} (Total size: {file_size} bytes)")
        print(f"Chunk size: {chunk_size} bytes")
        if from_head_bytes > 0:
            print(f"Skipping first {from_head_bytes} bytes of each chunk for detection.")
        if from_file_offset > 0:
            print(f"Starting analysis from file offset: {from_file_offset}")
        print("-" * 50)

        with open(filename, 'rb') as f:
            # 定位到文件的起始偏移
            if from_file_offset > 0:
                f.seek(from_file_offset)
            
            chunk_number = 0
            while True:
                chunk_data = f.read(chunk_size)
                if not chunk_
                    break

                # 计算当前块在原始文件中的基础偏移量
                offset = from_file_offset + chunk_number * chunk_size

                # 裁剪用于检测的数据（跳过头部字节）
                detection_data = chunk_data[from_head_bytes:] if len(chunk_data) > from_head_bytes else b''

                # --- 初始检测 ---
                encoding = None
                confidence = 0.0
                offset_by_used = 0 # 记录最终使用的偏移量

                if len(detection_data) > 0:
                    try:
                        result = cchardet.detect(detection_data)
                        if isinstance(result, dict):
                            encoding = result.get('encoding')
                            temp_confidence = result.get('confidence')
                            if temp_confidence is None:
                                confidence = 0.0
                            else:
                                confidence = temp_confidence
                            
                            if encoding is not None and not isinstance(encoding, str):
                                print(f"Warning: Unexpected encoding type in chunk {chunk_number}: {type(encoding)}", file=sys.stderr)
                                encoding = str(encoding) if encoding is not None else None
                        else:
                            print(f"Warning: cchardet returned unexpected type in chunk {chunk_number}: {type(result)}", file=sys.stderr)
                    except Exception as e:
                        print(f"Warning: cchardet failed on chunk {chunk_number}: {e}", file=sys.stderr)
                        encoding = "Error"
                        confidence = 0.0

                # --- 偏移优化逻辑 ---
                max_offset_attempts = 4
                if confidence == 0.0 and len(detection_data) > max_offset_attempts:
                    for offset_by in range(1, max_offset_attempts + 1):
                        if len(detection_data) > offset_by:
                            adjusted_detection_data = detection_data[offset_by:]
                            if len(adjusted_detection_data) > 0:
                                try:
                                    adjusted_result = cchardet.detect(adjusted_detection_data)
                                    if isinstance(adjusted_result, dict):
                                        adjusted_confidence = adjusted_result.get('confidence')
                                        if adjusted_confidence is None:
                                            adjusted_confidence = 0.0
                                        
                                        if adjusted_confidence > confidence:
                                            encoding = adjusted_result.get('encoding')
                                            confidence = adjusted_confidence
                                            offset_by_used = offset_by # 记录使用的偏移量
                                            
                                            if confidence > 0.0:
                                                break
                                except Exception:
                                    pass
                        else:
                            break

                # --- 格式化输出 ---
                encoding_display = encoding if encoding is not None else "N/A"
                output_line = (f"Chunk {chunk_number:4d} | Offset {offset:8d} | "
                               f"offset_by {offset_by_used:2d} | "
                               f"Size {len(chunk_data):4d} | "
                               f"Encoding: {encoding_display:>12} | "
                               f"Confidence: {confidence:6.4f}")
                
                # 可以根据置信度调整输出格式，例如高亮高置信度结果
                if confidence >= 0.75:
                     print(output_line) # 或用不同颜色/符号标记，这里简化为普通打印
                else:
                     print(output_line)

                # 如果置信度为0，可以选择打印数据内容（当前被注释掉）
                # if confidence == 0.0 and len(chunk_data) > 0:
                #     print ("\n")
                #     print_hex(chunk_data)
                #     print ("\n")
                    
                chunk_number += 1

            # 文件读取结束后的检查
            # f.tell() 在 seek 后返回的是绝对位置
            absolute_tell = f.tell()
            if absolute_tell < file_size:
                 print(f"Warning: Stopped reading before end of file '{filename}'. "
                      f"Read up to file offset {absolute_tell} bytes out of {file_size} bytes.", file=sys.stderr)

    except IOError as e:
        print(f"Error reading file '{filename}': {e}", file=sys.stderr)
    except Exception as e:
        print(f"An unexpected error occurred while processing '{filename}': {e}", file=sys.stderr)
    
    print("-" * 50 + f" Analysis of '{filename}' finished. " + "-" * 10 + "\n")


def detect_chunks_from_bytes(data, source_name="Byte Input", chunk_size=1024, from_head_bytes=0):
    """
    将字节数据按指定大小切块，并对每个块进行编码检测。
    如果检测置信度为0，则尝试偏移1-3字节重新检测。
    """
    data_len = len(data)
    print(f"Analyzing data from: {source_name} (Total size: {data_len} bytes)")
    print(f"Chunk size: {chunk_size} bytes")
    if from_head_bytes > 0:
        print(f"Skipping first {from_head_bytes} bytes of each chunk for detection.")
    print("-" * 50)

    if data_len == 0:
        print("Input data is empty.")
        return

    chunk_number = 0
    for i in range(0, data_len, chunk_size):
        chunk_data = data[i:i + chunk_size]
        if not chunk_
            break

        offset = i
        detection_data = chunk_data[from_head_bytes:] if len(chunk_data) > from_head_bytes else b''

        encoding = None
        confidence = 0.0
        
        if len(detection_data) > 0:
            try:
                result = cchardet.detect(detection_data)
                if isinstance(result, dict):
                    encoding = result.get('encoding')
                    temp_confidence = result.get('confidence')
                    if temp_confidence is None:
                        confidence = 0.0
                    else:
                        confidence = temp_confidence
                    
                    if encoding is not None and not isinstance(encoding, str):
                        print(f"Warning: Unexpected encoding type in chunk {chunk_number}: {type(encoding)}", file=sys.stderr)
                        encoding = str(encoding) if encoding is not None else None
                else:
                    print(f"Warning: cchardet returned unexpected type in chunk {chunk_number}: {type(result)}", file=sys.stderr)
            except Exception as e:
                print(f"Warning: cchardet failed on chunk {chunk_number}: {e}", file=sys.stderr)
                encoding = "Error"
                confidence = 0.0

        # --- 偏移优化逻辑 (针对 bytes 输入)---
        max_offset_attempts = 3
        offset_by_used = 0
        if confidence == 0.0 and len(detection_data) > max_offset_attempts:
            for offset_by in range(1, max_offset_attempts + 1):
                if len(detection_data) > offset_by:
                    adjusted_detection_data = detection_data[offset_by:]
                    if len(adjusted_detection_data) > 0:
                        try:
                            adjusted_result = cchardet.detect(adjusted_detection_data)
                            if isinstance(adjusted_result, dict):
                                adjusted_confidence = adjusted_result.get('confidence')
                                if adjusted_confidence is None:
                                    adjusted_confidence = 0.0
                                
                                if adjusted_confidence > confidence:
                                    encoding = adjusted_result.get('encoding')
                                    confidence = adjusted_confidence
                                    offset_by_used = offset_by
                                    
                                    if confidence > 0.0:
                                        break
                        except Exception:
                            pass
                else:
                    break

        # 格式化输出 (bytes 输入也显示 offset_by)
        encoding_display = encoding if encoding is not None else "N/A"
        print(f"Chunk {chunk_number:4d} | Offset {offset:8d} | "
              f"offset_by {offset_by_used:2d} | " # 添加 offset_by 显示
              f"Size {len(chunk_data):4d} | "
              f"Encoding: {encoding_display:>12} | "
              f"Confidence: {confidence:6.4f}")

        # 如果置信度为0，打印数据内容
        # if confidence == 0.0 and len(chunk_data) > 0:
        #     print ("\n")
        #     print_hex(chunk_data)
        #     print ("\n")

        chunk_number += 1

    print("-" * 50 + f" Analysis of '{source_name}' finished. " + "-" * 10 + "\n")


def main():
    """
    主函数，处理命令行参数并调用相应的检测函数。
    """
    if len(sys.argv) < 2:
        print("No filename provided. Reading binary data from STDIN...", file=sys.stderr)
        try:
            data = sys.stdin.buffer.read()
            detect_chunks_from_bytes(data, source_name="STDIN", chunk_size=1024)
        except KeyboardInterrupt:
            print("\nInterrupted by user.", file=sys.stderr)
        except Exception as e:
            print(f"Error reading from STDIN: {e}", file=sys.stderr)
        sys.exit(0)

    # 默认参数
    chunk_size = 1024
    from_head_bytes = 0
    from_file_offset = 0 # 新增默认参数
    filenames = []

    # 解析命令行参数
    i = 1
    while i < len(sys.argv):
        if sys.argv[i] == '-s':
            if i + 1 < len(sys.argv):
                try:
                    chunk_size = int(sys.argv[i + 1])
                    if chunk_size <= 0:
                        raise ValueError("Chunk size must be positive.")
                    i += 2
                except ValueError as e:
                    print(f"Error: Invalid chunk size '-s {sys.argv[i + 1]}': {e}", file=sys.stderr)
                    sys.exit(1)
            else:
                print("Error: Option '-s' requires an argument.", file=sys.stderr)
                sys.exit(1)
        elif sys.argv[i] == '-h':
             if i + 1 < len(sys.argv):
                try:
                    from_head_bytes = int(sys.argv[i + 1])
                    if from_head_bytes < 0:
                        raise ValueError("Head bytes to skip must be non-negative.")
                    i += 2
                except ValueError as e:
                    print(f"Error: Invalid head bytes '-h {sys.argv[i + 1]}': {e}", file=sys.stderr)
                    sys.exit(1)
             else:
                print("Error: Option '-h' requires an argument.", file=sys.stderr)
                sys.exit(1)
        # --- 新增：解析 -o 参数 ---
        elif sys.argv[i] == '-o':
             if i + 1 < len(sys.argv):
                try:
                    from_file_offset = int(sys.argv[i + 1])
                    if from_file_offset < 0:
                        raise ValueError("File offset must be non-negative.")
                    i += 2
                except ValueError as e:
                    print(f"Error: Invalid file offset '-o {sys.argv[i + 1]}': {e}", file=sys.stderr)
                    sys.exit(1)
             else:
                print("Error: Option '-o' requires an argument.", file=sys.stderr)
                sys.exit(1)
        # --- 新增结束 ---
        else:
            filenames.append(sys.argv[i])
            i += 1

    if not filenames:
        print("Error: No filename provided.", file=sys.stderr)
        sys.exit(1)

    # 对每个提供的文件进行处理
    for filename in filenames:
        # --- 修改：传递 from_file_offset 参数 ---
        detect_chunks_from_file(filename, chunk_size, from_head_bytes, from_file_offset)


if __name__ == "__main__":
    main()

Post Views: 14

此条目发表在未分类分类目录。将固定链接加入收藏夹。

发表回复取消回复