1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314
| #!/usr/bin/env python3 import cchardet import sys import os
def print_hex(data, width=16):
    """Print byte data as a hex dump with an ASCII side column.

    Each printed row covers ``width`` bytes and consists of the row's starting
    offset as eight hexadecimal digits, the bytes as space-separated two-digit
    hex values left-aligned in a field of ``width * 3`` characters, and the
    same bytes rendered as characters between ``|`` markers. Bytes outside the
    printable ASCII range 32..126 are shown as ``.``.
    """
    for row_start in range(0, len(data), width):
        row = data[row_start:row_start + width]

        hex_cells = []
        ascii_cells = []
        for value in row:
            hex_cells.append(f'{value:02x}')
            # Only printable ASCII is echoed; everything else becomes a dot.
            if 32 <= value <= 126:
                ascii_cells.append(chr(value))
            else:
                ascii_cells.append('.')

        hex_text = ' '.join(hex_cells)
        ascii_text = ''.join(ascii_cells)
        # Offset, padded hex column, then the ASCII rendering.
        print(f'{row_start:08x}: {hex_text:<{width*3}} |{ascii_text}|')
def _run_detector(payload, chunk_number, offset_by=0):
    """Run ``cchardet.detect`` on *payload* and return ``(encoding, confidence)``.

    The function never raises: a detector failure is reported on stderr and
    yields ``("Error", 0.0)``; a non-dict result is reported and yields
    ``(None, 0.0)``. A missing (``None``) confidence is mapped to ``0.0`` and a
    non-string encoding is converted with ``str`` so it can be formatted.
    ``offset_by`` is only used to label warnings issued during retries.
    """
    where = f"chunk {chunk_number}"
    if offset_by:
        where += f" (offset_by {offset_by})"

    try:
        result = cchardet.detect(payload)
    except Exception as e:
        # Best effort: one undetectable chunk must not abort the whole scan.
        print(f"Warning: cchardet failed on {where}: {e}", file=sys.stderr)
        return "Error", 0.0

    if not isinstance(result, dict):
        print(f"Warning: cchardet returned unexpected type in {where}: {type(result)}",
              file=sys.stderr)
        return None, 0.0

    encoding = result.get('encoding')
    confidence = result.get('confidence')
    if confidence is None:
        confidence = 0.0
    if encoding is not None and not isinstance(encoding, str):
        print(f"Warning: Unexpected encoding type in {where}: {type(encoding)}",
              file=sys.stderr)
        encoding = str(encoding)
    return encoding, confidence


def _detect_with_offset_retry(payload, chunk_number, max_offset_attempts):
    """Detect the encoding of *payload*, retrying with leading bytes dropped.

    Returns ``(encoding, confidence, offset_by_used)``. When the first attempt
    reports a confidence of exactly ``0.0`` and the payload is longer than
    ``max_offset_attempts`` bytes, detection is retried on ``payload[1:]``,
    ``payload[2:]`` ... up to ``payload[max_offset_attempts:]``; the first retry
    with a confidence above zero wins and its offset is returned. Otherwise the
    result of the first attempt is kept and ``offset_by_used`` is ``0``.
    """
    if not payload:
        # Nothing to analyse (e.g. the whole chunk was skipped as "head bytes").
        return None, 0.0, 0

    encoding, confidence = _run_detector(payload, chunk_number)
    if confidence != 0.0 or len(payload) <= max_offset_attempts:
        return encoding, confidence, 0

    for offset_by in range(1, max_offset_attempts + 1):
        retry_encoding, retry_confidence = _run_detector(
            payload[offset_by:], chunk_number, offset_by)
        if retry_confidence > confidence:
            return retry_encoding, retry_confidence, offset_by
    return encoding, confidence, 0


def _report_chunk(chunk_data, chunk_number, offset, from_head_bytes, max_offset_attempts):
    """Detect the encoding of one chunk and print its one-line report."""
    # Slicing past the end yields an empty payload, which skips detection.
    detection_data = chunk_data[from_head_bytes:]
    encoding, confidence, offset_by_used = _detect_with_offset_retry(
        detection_data, chunk_number, max_offset_attempts)

    encoding_display = encoding if encoding is not None else "N/A"
    # NOTE: print_hex(chunk_data) can be called here to dump chunks whose
    # confidence stayed at 0.0 when debugging.
    print(f"Chunk {chunk_number:4d} | Offset {offset:8d} | "
          f"offset_by {offset_by_used:2d} | "
          f"Size {len(chunk_data):4d} | "
          f"Encoding: {encoding_display:>12} | "
          f"Confidence: {confidence:6.4f}")


def detect_chunks_from_file(filename, chunk_size=1024, from_head_bytes=0, from_file_offset=0,
                            max_offset_attempts=4):
    """Split a file into fixed-size chunks and report the detected encoding of each.

    One line per chunk is printed to stdout; errors and warnings go to stderr.
    Nothing is returned.

    Args:
        filename: Path of the file to analyse. A missing file is reported on
            stderr and the function returns immediately.
        chunk_size: Number of bytes read per chunk. ``main()`` only passes
            positive values.
        from_head_bytes: Number of leading bytes of every chunk that are
            excluded from detection (the reported chunk size is unaffected).
        from_file_offset: Byte offset in the file at which reading starts.
        max_offset_attempts: When a chunk is detected with confidence 0.0,
            detection is retried with 1..max_offset_attempts leading bytes
            dropped. Defaults to 4 for file input.
    """
    if not os.path.exists(filename):
        print(f"Error: File '{filename}' does not exist.", file=sys.stderr)
        return

    try:
        file_size = os.path.getsize(filename)
        print(f"Analyzing file: {filename} (Total size: {file_size} bytes)")
        print(f"Chunk size: {chunk_size} bytes")
        if from_head_bytes > 0:
            print(f"Skipping first {from_head_bytes} bytes of each chunk for detection.")
        if from_file_offset > 0:
            print(f"Starting analysis from file offset: {from_file_offset}")
        print("-" * 50)

        with open(filename, 'rb') as f:
            if from_file_offset > 0:
                f.seek(from_file_offset)

            # read() returns b'' at end of file, which terminates the iterator.
            chunks = iter(lambda: f.read(chunk_size), b'')
            for chunk_number, chunk_data in enumerate(chunks):
                # Absolute position of this chunk in the original file.
                offset = from_file_offset + chunk_number * chunk_size
                _report_chunk(chunk_data, chunk_number, offset,
                              from_head_bytes, max_offset_attempts)

            # tell() is an absolute position, so it is comparable with the
            # file size even after the initial seek().
            absolute_tell = f.tell()
            if absolute_tell < file_size:
                print(f"Warning: Stopped reading before end of file '{filename}'. "
                      f"Read up to file offset {absolute_tell} bytes out of {file_size} bytes.",
                      file=sys.stderr)
    except OSError as e:
        print(f"Error reading file '{filename}': {e}", file=sys.stderr)
    except Exception as e:
        print(f"An unexpected error occurred while processing '{filename}': {e}",
              file=sys.stderr)

    print("-" * 50 + f" Analysis of '{filename}' finished. " + "-" * 10 + "\n")


def detect_chunks_from_bytes(data, source_name="Byte Input", chunk_size=1024, from_head_bytes=0,
                             max_offset_attempts=3):
    """Split in-memory bytes into fixed-size chunks and report each chunk's encoding.

    One line per chunk is printed to stdout; warnings go to stderr. Nothing is
    returned.

    Args:
        data: The bytes to analyse. Empty input is reported and nothing else
            is printed after the header.
        source_name: Label used in the header and footer lines.
        chunk_size: Number of bytes per chunk (used as the ``range`` step, so
            it must not be zero).
        from_head_bytes: Number of leading bytes of every chunk that are
            excluded from detection.
        max_offset_attempts: When a chunk is detected with confidence 0.0,
            detection is retried with 1..max_offset_attempts leading bytes
            dropped. Defaults to 3 for byte input (file input defaults to 4).
    """
    data_len = len(data)
    print(f"Analyzing data from: {source_name} (Total size: {data_len} bytes)")
    print(f"Chunk size: {chunk_size} bytes")
    if from_head_bytes > 0:
        print(f"Skipping first {from_head_bytes} bytes of each chunk for detection.")
    print("-" * 50)

    if data_len == 0:
        print("Input data is empty.")
        return

    for chunk_number, start in enumerate(range(0, data_len, chunk_size)):
        chunk_data = data[start:start + chunk_size]
        _report_chunk(chunk_data, chunk_number, start,
                      from_head_bytes, max_offset_attempts)

    print("-" * 50 + f" Analysis of '{source_name}' finished. " + "-" * 10 + "\n")
def main():
    """Parse the command line and run chunked encoding detection.

    Usage: ``[-s CHUNK_SIZE] [-h HEAD_BYTES] [-o FILE_OFFSET] FILE [FILE ...]``

    * With no arguments at all, binary data is read from STDIN and analysed
      with a chunk size of 1024. The process exits with status 0, or 1 when
      reading/analysing STDIN failed.
    * ``-s`` sets the chunk size (must be positive), ``-h`` the number of
      leading bytes of each chunk to skip for detection (non-negative; note
      that this is *not* a help flag) and ``-o`` the file offset at which
      reading starts (non-negative). Any other argument is taken as a filename.
    * An invalid or missing option value, or the absence of any filename,
      prints an error on stderr and exits with status 1.
    """
    if len(sys.argv) < 2:
        print("No filename provided. Reading binary data from STDIN...", file=sys.stderr)
        exit_status = 0
        try:
            data = sys.stdin.buffer.read()
            detect_chunks_from_bytes(data, source_name="STDIN", chunk_size=1024)
        except KeyboardInterrupt:
            print("\nInterrupted by user.", file=sys.stderr)
        except Exception as e:
            print(f"Error reading from STDIN: {e}", file=sys.stderr)
            # A failed run must not report success to the calling shell.
            exit_status = 1
        sys.exit(exit_status)

    # flag -> (setting name, label for error messages, smallest valid value,
    #          message used when the value is below that minimum)
    int_options = {
        '-s': ('chunk_size', 'chunk size', 1, "Chunk size must be positive."),
        '-h': ('from_head_bytes', 'head bytes', 0, "Head bytes to skip must be non-negative."),
        '-o': ('from_file_offset', 'file offset', 0, "File offset must be non-negative."),
    }
    settings = {'chunk_size': 1024, 'from_head_bytes': 0, 'from_file_offset': 0}
    filenames = []

    arguments = iter(sys.argv[1:])
    for argument in arguments:
        if argument not in int_options:
            filenames.append(argument)
            continue

        setting, label, minimum, range_message = int_options[argument]
        # Consume the option's value; None means the command line ended early.
        raw_value = next(arguments, None)
        if raw_value is None:
            print(f"Error: Option '{argument}' requires an argument.", file=sys.stderr)
            sys.exit(1)
        try:
            value = int(raw_value)
            if value < minimum:
                raise ValueError(range_message)
        except ValueError as e:
            print(f"Error: Invalid {label} '{argument} {raw_value}': {e}", file=sys.stderr)
            sys.exit(1)
        settings[setting] = value

    if not filenames:
        print("Error: No filename provided.", file=sys.stderr)
        sys.exit(1)

    # Every file is analysed with the same settings, in command-line order.
    for filename in filenames:
        detect_chunks_from_file(filename,
                                settings['chunk_size'],
                                settings['from_head_bytes'],
                                settings['from_file_offset'])
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__": main()
|