Python二进制文件编码探测工具

Python二进制文件编码探测工具

data-ad-format="fluid" data-ad-layout-key="-7k+ex-4a-9w+4a">

背景:实现一个基于 Python 语言 cchardet 库的二进制文件分析程序,按照预设的分段参数读取文件,并对每段数据进行 cchardet 文本编码探测。脚本支持从指定的文件偏移处开始分析(-o)、按 m 字节对二进制文件分段(-s)、跳过每段开头的 n 字节(-h),并在某段置信度为 0 时依次再跳过 1~4 字节重新探测。输出结果会展示每段的序号、起始偏移、片内重新探测所用的偏移字节数(offset_by)、片大小、编码方式、置信度,以及高置信度提示信息(当前脚本中简化为与普通结果相同的输出)。

如何使用脚本:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
# 1. 基本用法:分析整个文件
python encoding_detector.py myfile.bin

# 2. 指定块大小
python encoding_detector.py -s 512 myfile.bin

# 3. 跳过每个块的前 10 个字节
python encoding_detector.py -s 100 -h 10 myfile.bin

# 4. 从文件偏移 1116 开始分析
python encoding_detector.py -s 100 -o 1116 ../ftp-pcap/ftp-utf8-long.pcap

# 5. 结合使用:从偏移 1000 开始,每块 256 字节,跳过每块前 20 字节
python encoding_detector.py -s 256 -h 20 -o 1000 myfile.bin

# 6. 通过管道输入
cat myfile.bin | python encoding_detector.py -s 512

Python脚本实现:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
#!/usr/bin/env python3
import cchardet
import sys
import os

def print_hex(data, width=16):
    """Print ``data`` as a hex dump with an ASCII column.

    Each output row shows the offset of its first byte (8 hex digits), up to
    ``width`` bytes as two-digit hex values, and the same bytes as ASCII
    between ``|`` markers, with non-printable bytes rendered as ``.``.

    Args:
        data: Bytes-like object to dump.
        width: Number of bytes shown per output row.
    """
    for start in range(0, len(data), width):
        row = data[start:start + width]
        hex_part = ' '.join(f'{byte:02x}' for byte in row)
        # Printable ASCII is 32..126; everything else is shown as '.'.
        ascii_part = ''.join(chr(byte) if 32 <= byte <= 126 else '.' for byte in row)
        # Pad the hex column to a fixed width so the ASCII column stays
        # aligned on a short final row.
        print(f'{start:08x}: {hex_part:<{width * 3}} |{ascii_part}|')

def detect_chunks_from_file(filename, chunk_size=1024, from_head_bytes=0, from_file_offset=0):
    """Split a file into fixed-size chunks and report a detected encoding per chunk.

    For every chunk, ``cchardet.detect`` is run on the chunk minus its first
    ``from_head_bytes`` bytes. If that yields a confidence of 0, detection is
    retried with 1 to 4 additional leading bytes dropped, stopping at the
    first retry that gives a non-zero confidence. One result line per chunk
    is printed to stdout; warnings and errors go to stderr.

    Args:
        filename: Path of the file to analyze.
        chunk_size: Number of bytes read per chunk.
        from_head_bytes: Number of leading bytes of each chunk excluded from
            detection.
        from_file_offset: Byte offset in the file at which reading starts.

    Returns:
        None. A missing file is reported on stderr and the function returns
        early; I/O and unexpected errors are reported on stderr as well.
    """
    if not os.path.exists(filename):
        print(f"Error: File '{filename}' does not exist.", file=sys.stderr)
        return

    try:
        file_size = os.path.getsize(filename)
        print(f"Analyzing file: {filename} (Total size: {file_size} bytes)")
        print(f"Chunk size: {chunk_size} bytes")
        if from_head_bytes > 0:
            print(f"Skipping first {from_head_bytes} bytes of each chunk for detection.")
        if from_file_offset > 0:
            print(f"Starting analysis from file offset: {from_file_offset}")
        print("-" * 50)

        with open(filename, 'rb') as f:
            # Move to the requested starting offset.
            if from_file_offset > 0:
                f.seek(from_file_offset)

            chunk_number = 0
            while True:
                chunk_data = f.read(chunk_size)
                if not chunk_data:
                    break

                # Offset of this chunk relative to the start of the file.
                offset = from_file_offset + chunk_number * chunk_size

                # Data handed to the detector: the chunk without its head bytes.
                detection_data = chunk_data[from_head_bytes:] if len(chunk_data) > from_head_bytes else b''

                # --- Initial detection ---
                encoding = None
                confidence = 0.0
                offset_by_used = 0  # extra leading bytes dropped by a successful retry

                if detection_data:
                    try:
                        result = cchardet.detect(detection_data)
                        if isinstance(result, dict):
                            encoding = result.get('encoding')
                            reported = result.get('confidence')
                            confidence = 0.0 if reported is None else reported

                            if encoding is not None and not isinstance(encoding, str):
                                print(f"Warning: Unexpected encoding type in chunk {chunk_number}: {type(encoding)}", file=sys.stderr)
                                encoding = str(encoding)
                        else:
                            print(f"Warning: cchardet returned unexpected type in chunk {chunk_number}: {type(result)}", file=sys.stderr)
                    except Exception as e:
                        print(f"Warning: cchardet failed on chunk {chunk_number}: {e}", file=sys.stderr)
                        encoding = "Error"
                        confidence = 0.0

                # --- Retry with shifted start ---
                # Only attempted when the first pass produced no confidence and
                # the data is longer than the largest shift, so every shifted
                # slice is non-empty.
                max_offset_attempts = 4
                if confidence == 0.0 and len(detection_data) > max_offset_attempts:
                    for offset_by in range(1, max_offset_attempts + 1):
                        try:
                            adjusted_result = cchardet.detect(detection_data[offset_by:])
                            if not isinstance(adjusted_result, dict):
                                continue
                            adjusted_confidence = adjusted_result.get('confidence')
                            if adjusted_confidence is None:
                                adjusted_confidence = 0.0
                            if adjusted_confidence > confidence:
                                encoding = adjusted_result.get('encoding')
                                confidence = adjusted_confidence
                                offset_by_used = offset_by
                                break
                        except Exception:
                            # Best effort: a failed retry is skipped and the
                            # next shift is tried.
                            continue

                # --- Report ---
                encoding_display = encoding if encoding is not None else "N/A"
                print(f"Chunk {chunk_number:4d} | Offset {offset:8d} | "
                      f"offset_by {offset_by_used:2d} | "
                      f"Size {len(chunk_data):4d} | "
                      f"Encoding: {encoding_display:>12} | "
                      f"Confidence: {confidence:6.4f}")

                chunk_number += 1

            # After the loop, check whether the read position reached the
            # end of the file.
            absolute_tell = f.tell()
            if absolute_tell < file_size:
                print(f"Warning: Stopped reading before end of file '{filename}'. "
                      f"Read up to file offset {absolute_tell} bytes out of {file_size} bytes.", file=sys.stderr)

    except IOError as e:
        print(f"Error reading file '{filename}': {e}", file=sys.stderr)
    except Exception as e:
        print(f"An unexpected error occurred while processing '{filename}': {e}", file=sys.stderr)

    print("-" * 50 + f" Analysis of '{filename}' finished. " + "-" * 10 + "\n")

def detect_chunks_from_bytes(data, source_name="Byte Input", chunk_size=1024, from_head_bytes=0):
    """Split in-memory bytes into fixed-size chunks and report a detected encoding per chunk.

    For every chunk, ``cchardet.detect`` is run on the chunk minus its first
    ``from_head_bytes`` bytes. If that yields a confidence of 0, detection is
    retried with 1 to 3 additional leading bytes dropped, stopping at the
    first retry that gives a non-zero confidence. One result line per chunk
    is printed to stdout; warnings go to stderr.

    Args:
        data: Bytes to analyze.
        source_name: Label used for the input in the printed header and footer.
        chunk_size: Number of bytes per chunk.
        from_head_bytes: Number of leading bytes of each chunk excluded from
            detection.

    Returns:
        None. Empty input prints a notice and returns before any chunk is
        processed.
    """
    data_len = len(data)
    print(f"Analyzing data from: {source_name} (Total size: {data_len} bytes)")
    print(f"Chunk size: {chunk_size} bytes")
    if from_head_bytes > 0:
        print(f"Skipping first {from_head_bytes} bytes of each chunk for detection.")
    print("-" * 50)

    if data_len == 0:
        print("Input data is empty.")
        return

    for chunk_number, offset in enumerate(range(0, data_len, chunk_size)):
        chunk_data = data[offset:offset + chunk_size]
        if not chunk_data:
            break

        # Data handed to the detector: the chunk without its head bytes.
        detection_data = chunk_data[from_head_bytes:] if len(chunk_data) > from_head_bytes else b''

        # --- Initial detection ---
        encoding = None
        confidence = 0.0
        offset_by_used = 0  # extra leading bytes dropped by a successful retry

        if detection_data:
            try:
                result = cchardet.detect(detection_data)
                if isinstance(result, dict):
                    encoding = result.get('encoding')
                    reported = result.get('confidence')
                    confidence = 0.0 if reported is None else reported

                    if encoding is not None and not isinstance(encoding, str):
                        print(f"Warning: Unexpected encoding type in chunk {chunk_number}: {type(encoding)}", file=sys.stderr)
                        encoding = str(encoding)
                else:
                    print(f"Warning: cchardet returned unexpected type in chunk {chunk_number}: {type(result)}", file=sys.stderr)
            except Exception as e:
                print(f"Warning: cchardet failed on chunk {chunk_number}: {e}", file=sys.stderr)
                encoding = "Error"
                confidence = 0.0

        # --- Retry with shifted start (bytes input tries up to 3 shifts) ---
        # Only attempted when the first pass produced no confidence and the
        # data is longer than the largest shift, so every shifted slice is
        # non-empty.
        max_offset_attempts = 3
        if confidence == 0.0 and len(detection_data) > max_offset_attempts:
            for offset_by in range(1, max_offset_attempts + 1):
                try:
                    adjusted_result = cchardet.detect(detection_data[offset_by:])
                    if not isinstance(adjusted_result, dict):
                        continue
                    adjusted_confidence = adjusted_result.get('confidence')
                    if adjusted_confidence is None:
                        adjusted_confidence = 0.0
                    if adjusted_confidence > confidence:
                        encoding = adjusted_result.get('encoding')
                        confidence = adjusted_confidence
                        offset_by_used = offset_by
                        break
                except Exception:
                    # Best effort: a failed retry is skipped and the next
                    # shift is tried.
                    continue

        # --- Report ---
        encoding_display = encoding if encoding is not None else "N/A"
        print(f"Chunk {chunk_number:4d} | Offset {offset:8d} | "
              f"offset_by {offset_by_used:2d} | "
              f"Size {len(chunk_data):4d} | "
              f"Encoding: {encoding_display:>12} | "
              f"Confidence: {confidence:6.4f}")

    print("-" * 50 + f" Analysis of '{source_name}' finished. " + "-" * 10 + "\n")

def _read_int_option(argv, index, flag, label, minimum, range_error):
    """Return the integer value that follows option ``flag`` at ``argv[index]``.

    Prints an error to stderr and exits with status 1 when the value is
    missing, is not an integer, or is smaller than ``minimum``.
    """
    if index + 1 >= len(argv):
        print(f"Error: Option '{flag}' requires an argument.", file=sys.stderr)
        sys.exit(1)

    raw = argv[index + 1]
    try:
        value = int(raw)
        if value < minimum:
            raise ValueError(range_error)
    except ValueError as e:
        print(f"Error: Invalid {label} '{flag} {raw}': {e}", file=sys.stderr)
        sys.exit(1)
    return value


def main():
    """Parse the command line and run chunked encoding detection.

    Options:
        -s N   chunk size in bytes (positive, default 1024)
        -h N   leading bytes of each chunk to exclude from detection (default 0)
        -o N   file offset at which analysis starts (default 0, files only)

    Every other argument is treated as a filename. When no filename is
    given, binary data is read from STDIN and analyzed with the parsed
    ``-s`` / ``-h`` values, after which the process exits with status 0.
    Invalid option values exit with status 1.
    """
    chunk_size = 1024
    from_head_bytes = 0
    from_file_offset = 0
    filenames = []

    argv = sys.argv
    i = 1
    while i < len(argv):
        arg = argv[i]
        if arg == '-s':
            chunk_size = _read_int_option(
                argv, i, '-s', 'chunk size', 1, "Chunk size must be positive.")
            i += 2
        elif arg == '-h':
            from_head_bytes = _read_int_option(
                argv, i, '-h', 'head bytes', 0, "Head bytes to skip must be non-negative.")
            i += 2
        elif arg == '-o':
            from_file_offset = _read_int_option(
                argv, i, '-o', 'file offset', 0, "File offset must be non-negative.")
            i += 2
        else:
            filenames.append(arg)
            i += 1

    if not filenames:
        # No file given: analyze STDIN, so piped input works together with
        # options (e.g. `cat file | script -s 512`).
        print("No filename provided. Reading binary data from STDIN...", file=sys.stderr)
        if from_file_offset > 0:
            # The bytes-based detector takes no start offset.
            print("Warning: Option '-o' is ignored when reading from STDIN.", file=sys.stderr)
        try:
            data = sys.stdin.buffer.read()
            detect_chunks_from_bytes(data, source_name="STDIN", chunk_size=chunk_size,
                                     from_head_bytes=from_head_bytes)
        except KeyboardInterrupt:
            print("\nInterrupted by user.", file=sys.stderr)
        except Exception as e:
            print(f"Error reading from STDIN: {e}", file=sys.stderr)
        sys.exit(0)

    # Analyze every file with the same settings.
    for filename in filenames:
        detect_chunks_from_file(filename, chunk_size, from_head_bytes, from_file_offset)

# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()

关键词(keywords):文本编码,二进制,python文本编码,cchardet

data-ad-format="auto" data-full-width-responsive="true">