"""mht2html: interactively unpack an MHT (MIME HTML) archive into loose files."""
import base64
import os
import platform
import re


class Convertor:
    """Interactive converter that extracts the parts of an MHT file.

    Creating an instance runs the whole workflow: ``prepare()`` asks for the
    input file and the output folder, ``convert()`` walks the multipart
    document, and the input file is closed afterwards.

    Only ``text/html`` parts (written to ``index.html``) and ``image/*`` parts
    (assumed to be base64 encoded) are extracted; every other part is skipped.
    """

    # Matches the multipart boundary parameter, quoted or unquoted.
    _BOUNDARY_RE = re.compile(r'boundary\s*=\s*(?:"([^"]+)"|([^\s;"]+))',
                              re.IGNORECASE)

    def __init__(self):
        self.mht_file_path = None   # path typed by the user
        self.output_folder = None   # output directory, always ends with a separator
        self.clear_console = None   # callable that clears the terminal
        self.mht_file_size = None   # size of the input file, used for progress
        self.mht_file = None        # open text handle of the input file
        self.boundary = None        # multipart boundary without the leading '--'
        if self.prepare():
            try:
                self.convert()
            finally:
                # Close the input even when the conversion raises.
                self.mht_file.close()

    def prepare(self):
        """Ask the user for the input file and the output folder.

        Opens the input file, records its size and makes sure the output
        folder exists.

        Returns:
            True when everything is ready for ``convert()``; False when the
            output folder could not be created (the input file is closed again
            in that case).
        """
        clear_command = 'cls' if platform.system() == 'Windows' else 'clear'
        self.clear_console = lambda: os.system(clear_command)
        self.clear_console()
        print('======mht2html======\n' +
              'A mht file convertor\n' +
              'by kdxcxs(cx@kdxcxs.com)\n')
        while True:
            self.mht_file_path = input('[#] Please entry the path to the mht file to be converted\n===>')
            try:
                self.mht_file = open(self.mht_file_path, 'r', encoding='utf8')
            except FileNotFoundError:
                print('[!] No such file')
                continue
            except OSError as error:
                # e.g. a directory or a file without read permission
                print(f'[!] Can not open the file: {error}')
                continue
            # Ask the open handle for the size so it always refers to the
            # file that was actually opened.
            self.mht_file_size = os.fstat(self.mht_file.fileno()).st_size
            break
        # An empty answer means the current directory.
        self.output_folder = input('[#] Please entry the path to the output folder\n===>') or os.curdir
        try:
            os.makedirs(self.output_folder, exist_ok=True)
        except OSError as error:
            print(f'[!] Can not create the output folder: {error}')
            self.mht_file.close()
            return False
        # Joining with '' appends a trailing separator only when it is missing.
        self.output_folder = os.path.join(self.output_folder, '')
        return True

    def convert(self):
        """Read the file header, then extract every part until the file ends."""
        print('[#] Start converting')
        # header
        if not self.analyse_header(self.read_mht_file()):
            return
        # content
        while True:
            part = self.read_mht_file()
            # An empty string means there is no further part in the file.
            if not part or not self.analyse_content(part):
                break

    def print_progress(self, position=None):
        """Print the conversion progress as a percentage on one console line.

        Args:
            position: current offset in the input file; when omitted it is
                taken from ``self.mht_file.tell()``.
        """
        if not self.mht_file_size:
            # Empty input (or size not known): nothing meaningful to show and
            # the division below would fail.
            return
        if position is None:
            position = self.mht_file.tell()
        print('[#] %.3f%% converted' % (position / self.mht_file_size * 100), end='\r')

    def read_mht_file(self):
        """Return the next chunk of the MHT file.

        When the file is still at position 0 the chunk is the file header:
        every line up to (not including) the first blank line.  Otherwise it
        is the next MIME part: its headers, a blank line and its body.

        Returns:
            The chunk as a string, or '' when the end of the file was reached
            without finding another part.
        """
        if self.mht_file.tell() == 0:
            return self._read_file_header()
        return self._read_part()

    def _read_file_header(self):
        """Read lines up to the first blank line (or the end of the file)."""
        lines = []
        while True:
            line = self.mht_file.readline()
            self.print_progress()
            # readline() returns '' at the end of the file.
            if line in ('', '\n'):
                break
            lines.append(line)
        return ''.join(lines)

    def _read_part(self):
        """Skip to the next boundary line and read the part that follows it."""
        opening = f'--{self.boundary}'
        closing = f'{opening}--'
        # Skip everything up to and including the opening delimiter.
        while True:
            line = self.mht_file.readline()
            self.print_progress()
            if not line:
                return ''
            if line.rstrip() == opening:
                break
        lines = []
        while True:
            position = self.mht_file.tell()
            self.print_progress(position)
            line = self.mht_file.readline()
            if not line:
                break
            if line.rstrip() in (opening, closing):
                # Leave the delimiter unread so the next call finds it.
                self.mht_file.seek(position)
                # The line break right before a delimiter belongs to the
                # delimiter, not to the body.
                if lines and lines[-1].endswith('\n'):
                    lines[-1] = lines[-1][:-1]
                break
            lines.append(line)
        return ''.join(lines)

    def analyse_header(self, header):
        """Extract the multipart boundary from the file header.

        Args:
            header: the header text returned by the first ``read_mht_file()``.

        Returns:
            True and sets ``self.boundary`` when a boundary parameter was
            found, otherwise prints a message and returns False.
        """
        # find boundary
        match = self._BOUNDARY_RE.search(header)
        if match is None:
            print('[!] Not a mht file')
            return False
        self.boundary = match.group(1) or match.group(2)
        return True

    def analyse_content(self, content):
        """Write one MIME part to the output folder.

        Args:
            content: the part text (headers, blank line, body) as returned by
                ``read_mht_file()``.

        Returns:
            False when ``content`` is empty, otherwise True - also for parts
            that were skipped, so the caller continues with the next part.
        """
        # only support text and base64 image file
        # TODO:support other file formats
        # NOTE: text bodies are written exactly as stored; a transfer encoding
        # such as quoted-printable is not decoded.
        if not content:
            return False
        if content.startswith('\n'):
            # A part without any header line.
            raw_header, body = '', content[1:]
        else:
            raw_header, _, body = content.partition('\n\n')
        headers = self._parse_part_headers(raw_header)
        # Drop parameters such as '; charset="utf-8"' before comparing.
        media_type = headers.get('content-type', '').split(';', 1)[0].strip().lower()
        if media_type == 'text/html':
            with open(f'{self.output_folder}index.html', 'w', encoding='utf8') as htmlf:
                htmlf.write(body)
        elif media_type.split('/')[0] == 'image':
            self._write_image(headers.get('content-location', ''), body)
        return True

    @staticmethod
    def _parse_part_headers(raw_header):
        """Parse header lines into a dict keyed by lower-cased header name."""
        headers = {}
        current = None
        for line in raw_header.split('\n'):
            if line[:1] in (' ', '\t'):
                # Folded continuation of the previous header line.
                if current is not None:
                    headers[current] += ' ' + line.strip()
                continue
            # Split on the first ':' only, values such as URLs contain more.
            name, separator, value = line.partition(':')
            name = name.strip().lower()
            if not separator or name in headers:
                # Not a header line, or a repeated header: the first one wins.
                current = None
                continue
            headers[name] = value.strip()
            current = name
        return headers

    @staticmethod
    def _file_name_from_location(location):
        """Return the last path segment of a Content-Location value, or ''."""
        path = location.split('?', 1)[0].split('#', 1)[0]
        name = path.replace('\\', '/').rsplit('/', 1)[-1]
        return '' if name in ('.', '..') else name

    def _write_image(self, location, body):
        """Decode a base64 body and store it under the name taken from location."""
        # Only the last path segment is used, so a URL or a '../' sequence can
        # not place the file outside the output folder.
        file_name = self._file_name_from_location(location)
        if not file_name:
            print(f'[!] Skipped an image without a usable Content-Location: {location!r}')
            return
        try:
            data = base64.decodebytes(body.encode())
        except ValueError:
            # binascii.Error, raised for broken base64, is a ValueError.
            print(f'[!] Skipped an image that is not valid base64: {file_name}')
            return
        with open(f'{self.output_folder}{file_name}', 'wb') as imgf:
            imgf.write(data)


if __name__ == '__main__':
    convertor = Convertor()