def _repairGB18030Text(byteText, explicit=False):
    """Return *byteText* decoded as GB18030 with undecodable bytes escaped.

    The data is scanned left to right and split into GB18030 units:

    * one byte    0x00-0x7F
    * two bytes   0x81-0xFE, then 0x40-0x7E or 0x80-0xFE
    * four bytes  0x81-0xFE, 0x30-0x39, 0x81-0xFE, 0x30-0x39

    A lead byte that does not start a complete, decodable unit is replaced
    by the ASCII text ``0xNN`` (two upper-case hex digits) and scanning
    resumes at the following byte.

    Args:
        byteText: Raw bytes to repair.
        explicit: When true, non-printable single-byte characters other
            than CR and LF are also written as ``0xNN``; otherwise they
            are kept unchanged.

    Returns:
        The repaired text as a ``str``.
    """
    byteLength = len(byteText)
    pieces = []
    pos = 0
    while pos < byteLength:
        byte1 = byteText[pos]

        # Single-byte (ASCII) range.
        if byte1 <= 0x7f:
            char = chr(byte1)
            if char.isprintable() or char in '\r\n' or not explicit:
                pieces.append(char)
            else:
                pieces.append('0x%02X' % byte1)
            pos += 1
            continue

        # Work out how wide a multi-byte unit starting here would be.
        # Bounds are checked explicitly, so a sequence truncated by the
        # end of the data is never treated as complete.
        width = 0
        if 0x81 <= byte1 <= 0xfe and pos + 1 < byteLength:
            byte2 = byteText[pos + 1]
            if 0x40 <= byte2 <= 0xfe and byte2 != 0x7f:
                width = 2
            elif 0x30 <= byte2 <= 0x39 and pos + 3 < byteLength:
                byte3 = byteText[pos + 2]
                byte4 = byteText[pos + 3]
                # Both trailing bytes must be in range; accepting either
                # one alone lets malformed sequences through.
                if 0x81 <= byte3 <= 0xfe and 0x30 <= byte4 <= 0x39:
                    width = 4

        text = None
        if width:
            # Well-formed units can still be unassigned code positions,
            # so the codec has the final say.
            try:
                text = byteText[pos:pos + width].decode('gb18030')
            except UnicodeDecodeError:
                text = None

        if text is None:
            pieces.append('0x%02X' % byte1)
            pos += 1
        else:
            pieces.append(text)
            pos += width
    return ''.join(pieces)


def RepairGB18030File(srcFile, dstFile, explicit=False):
    """Copy *srcFile* to *dstFile*, repairing invalid GB18030 byte sequences.

    The source is read as raw bytes and its length is printed. Every byte
    that cannot be interpreted as part of a valid GB18030 character is
    replaced by the ASCII text ``0xNN`` so that the destination file is
    always valid GB18030.

    Args:
        srcFile: Path of the (possibly corrupted) input file.
        dstFile: Path of the repaired output file; it is overwritten.
        explicit: When true, non-printable ASCII control characters other
            than CR and LF are also written as ``0xNN``.

    Raises:
        OSError: If either file cannot be opened, read or written.
    """
    with open(srcFile, mode='rb') as fin:
        byteText = fin.read()
    print('byteLength: %d' % len(byteText))

    repairedText = _repairGB18030Text(byteText, explicit)

    # newline='' stops the text layer from translating '\n', so the line
    # endings of the source are preserved byte for byte on every platform.
    with open(dstFile, mode='w', encoding='gb18030', newline='') as fout:
        fout.write(repairedText)