You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
117 lines
4.1 KiB
117 lines
4.1 KiB
4 years ago
|
import base64
|
||
|
import os
|
||
|
import platform
|
||
|
import re
|
||
|
|
||
|
|
||
|
class Convertor:
    """Interactive converter that unpacks a ``.mht`` web archive into a folder.

    Creating an instance runs the whole program: it asks for the archive path
    and the output folder, writes the first ``text/html`` part of the archive
    to ``index.html`` and every base64 encoded image part to a file named
    after the last path segment of its ``Content-Location`` header.  Parts of
    any other type are skipped.
    """

    def __init__(self):
        self.mht_file_path = None
        self.output_folder = None
        self.clear_console = None
        self.mht_file_size = None
        self.mht_file = None
        self.boundary = None
        self.html_written = False  # becomes True once index.html is written

        try:
            if self.prepare():
                self.convert()
        finally:
            # Close the archive even when preparing or converting raises.
            if self.mht_file is not None:
                self.mht_file.close()

    def prepare(self):
        """Ask the user for the archive and the output folder.

        Opens the archive (re-prompting until it can be opened), records its
        size for the progress display and creates the output folder when it
        does not exist yet.  An empty answer for the output folder selects the
        current working directory.

        Returns:
            True once everything needed by convert() is set up.
        """
        if platform.system() == 'Windows':
            self.clear_console = lambda: os.system('cls')
        else:
            self.clear_console = lambda: os.system('clear')

        self.clear_console()
        print('======mht2html======\n'
              'A mht file convertor\n'
              'by kdxcxs(cx@kdxcxs.com)\n')

        while True:
            self.mht_file_path = input('[#] Please entry the path to the mht file to be converted\n===>')
            try:
                self.mht_file = open(self.mht_file_path, 'r', encoding='utf8')
            except FileNotFoundError:
                print('[!] No such file')
                continue
            except OSError as error:
                # e.g. the path is a directory or is not readable
                print(f'[!] Can not open the file: {error}')
                continue
            # Ask the open handle for the size so it always matches the file
            # that is actually being read.
            self.mht_file_size = os.fstat(self.mht_file.fileno()).st_size
            break

        self.output_folder = input('[#] Please entry the path to the output folder\n===>')
        if self.output_folder:
            os.makedirs(self.output_folder, exist_ok=True)
            # Joining with '' appends the trailing separator only if missing.
            self.output_folder = os.path.join(self.output_folder, '')
        return True

    def convert(self):
        """Convert the opened archive, part by part, into the output folder."""
        print('[#] Start converting')

        # header
        if not self.analyse_header(self.read_mht_file()):
            return

        # content: read_mht_file() returns '' once no part is left, which
        # analyse_content() reports as False.
        while self.analyse_content(self.read_mht_file()):
            pass

        # The progress line ends with '\r', so move on to a fresh line.
        print('\n[#] Finished')

    def print_progress(self, position=None):
        """Print how much of the archive has been read so far.

        Args:
            position: file position to report; the current position of the
                archive is used when omitted.
        """
        if not self.mht_file_size:
            return  # empty archive: nothing to report, avoid dividing by zero
        if position is None:
            position = self.mht_file.tell()
        print('[#] %.3f%% converted' % (position / self.mht_file_size * 100), end='\r')

    def read_mht_file(self):
        """Read the next logical chunk of the archive.

        At the very start of the file the chunk is the archive header (all
        lines up to the first blank line).  Afterwards it is the next part:
        its header lines, a blank line and its body up to the next boundary.

        Returns:
            The chunk as text, or '' when nothing is left to read.
        """
        if self.mht_file.tell() == 0:
            return self._read_until_blank_line()

        while self._skip_to_next_part():
            part = self._read_part()
            if part:
                return part
            # A part without any line carries nothing; try the next one.
        return ''

    def _delimiters(self):
        """Return the opening and the closing boundary line (without newline)."""
        opening = f'--{self.boundary}'
        return opening, f'{opening}--'

    def _read_until_blank_line(self):
        """Return all lines up to (not including) the next blank line or EOF."""
        lines = []
        while True:
            line = self.mht_file.readline()
            self.print_progress()
            if line in ('', '\n'):  # '' is end of file, '\n' a blank line
                break
            lines.append(line)
        return ''.join(lines)

    def _skip_to_next_part(self):
        """Consume lines up to and including the next opening boundary.

        Returns:
            True when a part follows, False at the closing boundary or EOF.
        """
        opening, closing = self._delimiters()
        while True:
            line = self.mht_file.readline()
            self.print_progress()
            if not line:
                return False
            marker = line.rstrip()
            if marker == opening:
                return True
            if marker == closing:
                return False

    def _read_part(self):
        """Return every line of the current part, leaving the boundary unread."""
        delimiters = self._delimiters()
        lines = []
        while True:
            position = self.mht_file.tell()
            line = self.mht_file.readline()
            if not line:
                break
            if line.rstrip() in delimiters:
                # Un-read the boundary so the next call can find it.
                self.mht_file.seek(position)
                break
            lines.append(line)
            self.print_progress(position)
        return ''.join(lines)

    def analyse_header(self, header):
        """Extract the multipart boundary from the archive header.

        Args:
            header: text of the archive header.

        Returns:
            True when a boundary was found and stored in ``self.boundary``,
            False (after printing a message) otherwise.
        """
        # Accept a quoted boundary (stopping at its closing quote) as well as
        # an unquoted one.
        boundary_re = re.search(r'boundary=(?:"([^"]+)"|([^\s;"]+))', header, re.IGNORECASE)
        if boundary_re is None:
            print('[!] Not a mht file')
            return False
        self.boundary = boundary_re.group(1) or boundary_re.group(2)
        return True

    def analyse_content(self, content):
        """Write one part of the archive to the output folder.

        Args:
            content: text of the part as returned by read_mht_file().

        Returns:
            True when the part was written or skipped and reading should go
            on, False when ``content`` is empty (no part left).
        """
        # only support text and base64 image file
        # TODO:support other file formats
        if not content:
            return False

        if content.startswith('\n'):
            # The part has no header lines at all.
            raw_header, body = '', content[1:]
        else:
            raw_header, _, body = content.partition('\n\n')
        headers = self._parse_headers(raw_header)
        # Drop parameters such as '; charset="utf-8"' before comparing.
        content_type = headers.get('content-type', '').split(';', 1)[0].strip().lower()

        if content_type == 'text/html':
            # Only the first HTML part becomes index.html; later HTML parts
            # must not overwrite it.
            if not self.html_written:
                # NOTE: the body is written exactly as stored in the archive;
                # no transfer decoding is applied to it.
                with open(f'{self.output_folder}index.html', 'w', encoding='utf8') as htmlf:
                    htmlf.write(body)
                self.html_written = True
        elif content_type.split('/', 1)[0] == 'image':
            self._save_image(headers, body)
        return True

    def _save_image(self, headers, body):
        """Decode a base64 image body and write it next to index.html."""
        encoding = headers.get('content-transfer-encoding', 'base64').strip().lower()
        file_name = self._file_name_from_location(headers.get('content-location', ''))
        if encoding != 'base64' or not file_name:
            return
        try:
            data = base64.decodebytes(body.encode())
        except ValueError:  # binascii.Error is a ValueError
            print(f'[!] Skipped {file_name}: not valid base64 data')
            return
        with open(f'{self.output_folder}{file_name}', 'wb') as imgf:
            imgf.write(data)

    @staticmethod
    def _parse_headers(raw_header):
        """Parse header lines into a dict keyed by lower-cased header name.

        Only the first colon of a line separates name and value, so values
        containing colons (URLs) stay intact.  Lines starting with a space or
        a tab continue the previous header.  The first occurrence of a name
        wins.
        """
        headers = {}
        current = None
        for line in raw_header.split('\n'):
            if line[:1] in (' ', '\t'):
                if current is not None:
                    headers[current] += ' ' + line.strip()
                continue
            name, separator, value = line.partition(':')
            name = name.strip().lower()
            if not separator or name in headers:
                current = None
                continue
            headers[name] = value.strip()
            current = name
        return headers

    @staticmethod
    def _file_name_from_location(location):
        """Return a plain file name for a Content-Location value, or ''.

        Query string, fragment, directories and any scheme prefix are dropped
        so the result can never point outside the output folder.
        """
        path = location.split('?', 1)[0].split('#', 1)[0]
        name = path.replace('\\', '/').rstrip('/').rsplit('/', 1)[-1]
        name = name.rsplit(':', 1)[-1]
        return '' if name in ('', '.', '..') else name
|
||
|
|
||
|
|
||
|
if __name__ == "__main__":
    # Creating the instance is what starts the interactive conversion:
    # the constructor prompts for the paths and converts the archive.
    convertor = Convertor()
|