A mht file convertor
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

117 lines
4.1 KiB

4 years ago
import base64
import os
import platform
import re
class Convertor:
    """Interactive converter that unpacks an MHT (MHTML) archive into a folder.

    Constructing an instance runs the whole session: it prompts for the
    archive path and the output folder, reads the archive header to learn the
    multipart boundary, then walks the parts one by one.

    Supported parts:
      * ``text/html``  -> written verbatim to ``index.html`` (a later HTML
        part overwrites an earlier one);
      * ``image/*``    -> body is base64-decoded and written under the file
        name taken from ``Content-Location``.
    Every other part is skipped.

    NOTE: transfer encodings are not interpreted. HTML stored as
    quoted-printable is written as-is, and image bodies are always treated
    as base64.
    TODO: support other file formats and transfer encodings.
    """

    def __init__(self):
        """Run the interactive conversion and always release the file handle."""
        self.mht_file_path = None   # path of the archive, as typed by the user
        self.output_folder = None   # destination folder, ends with os.sep
        self.clear_console = None   # callable that clears the terminal
        self.mht_file_size = None   # archive size in bytes (progress only)
        self.mht_file = None        # open text handle on the archive
        self.boundary = None        # multipart boundary without the leading '--'
        try:
            if self.prepare():
                self.convert()
        finally:
            # Close the archive even when prepare()/convert() raised.
            if self.mht_file is not None:
                self.mht_file.close()

    def prepare(self):
        """Show the banner and ask for the input file and output folder.

        Keeps asking until the archive can be opened. An empty answer for the
        output folder selects the current directory. The folder is created
        when missing and ``self.output_folder`` is normalised to end with
        ``os.sep``.

        Returns:
            True once both paths are ready.
        """
        if platform.system() == 'Windows':
            self.clear_console = lambda: os.system('cls')
        else:
            self.clear_console = lambda: os.system('clear')
        self.clear_console()
        print('======mht2html======\n' +
              'A mht file convertor\n' +
              'by kdxcxs(cx@kdxcxs.com)\n')
        while True:
            self.mht_file_path = input('[#] Please entry the path to the mht file to be converted\n===>')
            try:
                # stat first so a failure never leaves a freshly opened handle behind
                self.mht_file_size = os.stat(self.mht_file_path).st_size
                self.mht_file = open(self.mht_file_path, 'r', encoding='utf8')
            except FileNotFoundError:
                print('[!] No such file')
            except OSError as err:
                # e.g. a directory or an unreadable file: ask again instead of crashing
                print(f'[!] Can not open the file: {err}')
            else:
                break
        self.output_folder = input('[#] Please entry the path to the output folder\n===>') or os.curdir
        os.makedirs(self.output_folder, exist_ok=True)
        if not self.output_folder.endswith(os.sep):
            self.output_folder += os.sep
        return True

    def convert(self):
        """Parse the archive header, then extract parts until the input ends."""
        print('[#] Start converting')
        # header
        if not self.analyse_header(self.read_mht_file()):
            return
        # content: analyse_content() returns False once there is nothing left
        while self.analyse_content(self.read_mht_file()):
            pass

    def print_progress(self):
        """Print the share of the archive consumed so far on a single line."""
        if not self.mht_file_size:
            # size unknown or empty file: nothing meaningful to show
            return
        print('[#] %.3f%% converted' % (self.mht_file.tell() / self.mht_file_size * 100), end='\r')

    def read_mht_file(self):
        """Read the next logical chunk of the archive.

        At the very start of the file (``tell() == 0``) the chunk is the
        archive header: every line up to the first blank line.

        Afterwards the chunk is one MIME part: lines are skipped up to the
        next ``--<boundary>`` delimiter, then everything up to (but not
        including) the following delimiter is returned, i.e. the part header,
        a blank line and the body. Blank lines inside the body are kept. The
        line break that precedes the delimiter is dropped.

        Returns:
            The chunk as text, or ``''`` when the closing ``--<boundary>--``
            delimiter or the end of the file is reached.
        """
        reader = self.mht_file
        if reader.tell() == 0:
            header_lines = []
            while True:
                line = reader.readline()
                self.print_progress()
                if line in ('', '\n'):  # '' is EOF, '\n' ends the header
                    break
                header_lines.append(line)
            return ''.join(header_lines)

        delimiter = f'--{self.boundary}'
        closing = delimiter + '--'

        # Skip the preamble / leftovers up to the delimiter opening the part.
        while True:
            line = reader.readline()
            self.print_progress()
            if line == '':
                return ''
            marker = line.rstrip()
            if marker == closing:
                return ''
            if marker == delimiter:
                break

        part_lines = []
        while True:
            position = reader.tell()
            line = reader.readline()
            if line == '':
                break
            if line.rstrip() in (delimiter, closing):
                # Leave the delimiter unread so the next call starts on it.
                reader.seek(position)
                break
            self.print_progress()
            part_lines.append(line)
        part = ''.join(part_lines)
        if part.endswith('\n'):
            part = part[:-1]
        return part

    def analyse_header(self, header):
        """Extract the multipart boundary from the archive header.

        Accepts both ``boundary="value"`` and ``boundary=value`` and ignores
        any parameter that follows on the same line.

        Args:
            header: text returned by the first ``read_mht_file()`` call.

        Returns:
            True when a non-empty boundary was stored in ``self.boundary``,
            False (after printing a message) otherwise.
        """
        found = re.search(r'boundary\s*=\s*(?:"([^"]*)"|([^\s;"]+))', header, re.IGNORECASE)
        boundary = None
        if found is not None:
            boundary = found.group(1) if found.group(1) is not None else found.group(2)
        if not boundary:
            print('[!] Not a mht file')
            return False
        self.boundary = boundary
        return True

    def analyse_content(self, content):
        """Write one MIME part to the output folder.

        Args:
            content: part text as returned by ``read_mht_file()``: header
                lines, a blank line, then the body.

        Returns:
            False when ``content`` is empty (no more parts), True otherwise,
            including for unsupported or broken parts, which are skipped so
            that the remaining parts are still extracted.
        """
        if not content.strip():
            return False
        if content.startswith('\n'):
            # part without any header line
            raw_header, body = '', content[1:]
        else:
            raw_header, _, body = content.partition('\n\n')
        headers = self._parse_part_header(raw_header)
        # drop parameters such as '; charset="utf-8"'
        content_type = headers.get('content-type', '').split(';', 1)[0].strip().lower()

        if content_type == 'text/html':
            with open(os.path.join(self.output_folder, 'index.html'), 'w', encoding='utf8') as htmlf:
                htmlf.write(body)
        elif content_type.startswith('image/'):
            file_name = self._safe_file_name(headers.get('content-location', ''))
            if not file_name:
                print('[!] Skipped an image without a usable Content-Location')
                return True
            try:
                data = base64.decodebytes(body.encode())
            except ValueError:  # binascii.Error is a ValueError
                print(f'[!] Skipped a broken base64 image: {file_name}')
                return True
            with open(os.path.join(self.output_folder, file_name), 'wb') as imgf:
                imgf.write(data)
        return True

    @staticmethod
    def _parse_part_header(raw_header):
        """Turn raw header lines into a dict keyed by lower-cased field name.

        Only the first ':' separates name from value, so URLs survive intact.
        Lines starting with a space or tab continue the previous field.
        """
        headers = {}
        last_name = None
        for line in raw_header.split('\n'):
            if line[:1] in (' ', '\t') and last_name is not None:
                headers[last_name] += ' ' + line.strip()
                continue
            name, separator, value = line.partition(':')
            if not separator:
                continue
            last_name = name.strip().lower()
            headers[last_name] = value.strip()
        return headers

    @staticmethod
    def _safe_file_name(location):
        """Reduce a Content-Location value to a bare file name ('' if none).

        Query string, fragment, directories and any scheme prefix are
        removed, so the archive cannot choose a path outside the output
        folder. Images that share a file name overwrite each other.
        """
        path = location.split('?', 1)[0].split('#', 1)[0].replace('\\', '/')
        name = path.rstrip('/').rsplit('/', 1)[-1].rsplit(':', 1)[-1]
        if name in ('', '.', '..'):
            return ''
        return name
# Script entry point: building a Convertor runs the whole interactive session
# from its constructor, so nothing else has to be called here.
if __name__ == "__main__":
    convertor = Convertor()