"""mht2html: a mht file convertor.

Interactive command-line tool that splits an MHT (MHTML) archive into an
``index.html`` file plus the image files embedded in the archive.

TODO: add a fast mode (the project notes do not spell out the intended
    reading strategy).
TODO: support other file formats.
"""
import base64
import os
import platform
import quopri
import re


class Convertor:
    """Interactively convert one MHT archive into HTML and image files.

    Constructing an instance runs the whole program: it asks for the input
    file and the output folder, then converts the archive.

    Attributes set while running:
        mht_file_path: path of the archive as typed by the user.
        output_folder: output directory, always ending with ``os.sep``.
        clear_console: zero-argument callable that clears the terminal.
        mht_file_size: size of the archive in bytes (used for progress).
        mht_file: the archive, opened as UTF-8 text.
        boundary: the multipart boundary taken from the archive header.
    """

    # Accepts both boundary="value" and the unquoted boundary=value form.
    # The quoted alternative stops at the first closing quote so that other
    # quoted parameters on the same line are not swallowed.
    _BOUNDARY_PATTERN = re.compile(
        r'boundary\s*=\s*(?:"([^"]*)"|([^\s;"]+))', re.IGNORECASE)

    # Becomes True on the instance once index.html has been written, so that
    # a later text/html part does not overwrite the first one.
    _index_written = False

    def __init__(self):
        """Prompt for the paths and run the conversion."""
        self.mht_file_path = None
        self.output_folder = None
        self.clear_console = None
        self.mht_file_size = None
        self.mht_file = None
        self.boundary = None
        if self.prepare():
            try:
                self.convert()
            finally:
                # Release the input file even when the conversion fails.
                self.mht_file.close()

    def prepare(self):
        """Show the banner and ask for the input file and output folder.

        Keeps asking until the archive can be opened. The output folder is
        created when missing; an empty answer selects the current directory.

        Returns:
            True once ``mht_file``, ``mht_file_size`` and ``output_folder``
            are set.
        """
        if platform.system() == 'Windows':
            self.clear_console = lambda: os.system('cls')
        else:
            self.clear_console = lambda: os.system('clear')

        self.clear_console()
        print('======mht2html======\n' +
              'A mht file convertor\n' +
              'by kdxcxs(cx@kdxcxs.com)\n')

        while True:
            self.mht_file_path = input(
                '[#] Please entry the path to the mht file to be converted\n===>')
            try:
                self.mht_file = open(self.mht_file_path, 'r', encoding='utf8')
            except FileNotFoundError:
                print('[!] No such file')
                continue
            self.mht_file_size = os.stat(self.mht_file_path).st_size
            break

        # An empty answer used to crash on output_folder[-1].
        self.output_folder = input(
            '[#] Please entry the path to the output folder\n===>') or '.'
        os.makedirs(self.output_folder, exist_ok=True)
        if not self.output_folder.endswith(os.sep):
            self.output_folder += os.sep
        return True

    def convert(self):
        """Read the archive header, then extract every part that follows.

        Stops quietly when the header carries no boundary. Parts of an
        unsupported type are skipped; they do not end the conversion.
        """
        print('[#] Start converting')

        # header
        if not self.analyse_header(self.read_mht_file()):
            return

        # content
        while True:
            part = self.read_mht_file()
            if not part:  # end of the archive
                break
            self.analyse_content(part)

    def print_progress(self):
        """Print the share of the archive read so far on one console line."""
        if not self.mht_file_size:
            # Empty file (or size unknown): nothing meaningful to divide by.
            return
        print('[#] %.3f%% converted' % (self.mht_file.tell() / self.mht_file_size * 100), end='\r')

    def read_mht_file(self):
        """Return the next logical chunk of the archive.

        At the very start of the file the chunk is the archive header: every
        line up to, but not including, the first blank line.

        Afterwards the chunk is the next part: the lines between one boundary
        delimiter and the following one, i.e. the part headers, a blank line
        and the part body. The body may itself contain blank lines.

        Returns:
            The chunk as text, or ``''`` when the end of the file or the
            closing delimiter has been reached.
        """
        if self.mht_file.tell() == 0:
            return self._read_archive_header()
        if not self._skip_to_delimiter():
            return ''
        return self._read_part()

    def _read_archive_header(self):
        """Collect lines until the first blank line or the end of the file."""
        lines = []
        while True:
            line = self.mht_file.readline()
            self.print_progress()
            # readline() returns '' at end of file; without this check the
            # loop would never finish on a truncated or empty file.
            if line in ('', '\n'):
                break
            lines.append(line)
        return ''.join(lines)

    def _skip_to_delimiter(self):
        """Advance past the next opening delimiter; False if there is none."""
        delimiter = f'--{self.boundary}'
        closing = f'{delimiter}--'
        while True:
            line = self.mht_file.readline()
            self.print_progress()
            if not line:
                return False
            marker = line.rstrip()
            if marker == delimiter:
                return True
            if marker == closing:
                return False

    def _read_part(self):
        """Collect the lines of one part, leaving the next delimiter unread."""
        delimiter = f'--{self.boundary}'
        closing = f'{delimiter}--'
        lines = []
        reached_delimiter = False
        while True:
            position = self.mht_file.tell()
            line = self.mht_file.readline()
            if not line:
                break
            if line.rstrip() in (delimiter, closing):
                # Put the delimiter back so the next call can find it.
                self.mht_file.seek(position)
                reached_delimiter = True
                break
            lines.append(line)
            self.print_progress()
        part = ''.join(lines)
        if reached_delimiter and part.endswith('\n'):
            # The line break right before a delimiter belongs to the
            # delimiter, not to the part body.
            part = part[:-1]
        return part

    def analyse_header(self, header):
        """Extract the multipart boundary from the archive header.

        Args:
            header: the archive header text as returned by read_mht_file().

        Returns:
            True and sets ``self.boundary`` when a non-empty boundary
            parameter is present; otherwise prints a message and returns
            False, leaving ``self.boundary`` untouched.
        """
        match = self._BOUNDARY_PATTERN.search(header)
        boundary = (match.group(1) or match.group(2)) if match else None
        if not boundary:
            print('[!] Not a mht file')
            return False
        self.boundary = boundary
        return True

    def analyse_content(self, content):
        """Write one archive part to the output folder when it is supported.

        Supported parts are the first ``text/html`` part, written to
        ``index.html``, and ``image/*`` parts, written under the last path
        component of their Content-Location.

        Args:
            content: part headers, a blank line, then the part body.

        Returns:
            True when a file was written, False when the part was skipped.
        """
        # only support text and base64 image file
        # TODO:support other file formats
        header_text, _, body = content.partition('\n\n')
        headers = self._parse_headers(header_text)
        content_type = headers.get('content-type', '').split(';')[0].strip().lower()
        encoding = headers.get('content-transfer-encoding', '').strip().lower()

        if content_type == 'text/html':
            if self._index_written:
                # Keep the first HTML part; later ones would overwrite it.
                return False
            try:
                decoded = self._decode_body(body, encoding)
            except ValueError:
                print('[!] Skipping a part that could not be decoded')
                return False
            target = f'{self.output_folder}index.html'
            if decoded is None:
                with open(target, 'w', encoding='utf8') as htmlf:
                    htmlf.write(body)
            else:
                # Decoded bytes are written untouched so they still match
                # whatever charset the page declares for itself.
                with open(target, 'wb') as htmlf:
                    htmlf.write(decoded)
            self._index_written = True
            return True

        if content_type.startswith('image/'):
            file_name = self._safe_file_name(headers.get('content-location', ''))
            if not file_name:
                print('[!] Skipping an image without a usable Content-Location')
                return False
            if encoding != 'quoted-printable':
                # Images are treated as base64 unless stated otherwise.
                encoding = 'base64'
            try:
                data = self._decode_body(body, encoding)
            except ValueError:
                print('[!] Skipping a part that could not be decoded')
                return False
            with open(f'{self.output_folder}{file_name}', 'wb') as imgf:
                imgf.write(data)
            return True

        return False

    @staticmethod
    def _parse_headers(header_text):
        """Parse header lines into a dict keyed by lower-cased header name."""
        headers = {}
        last_key = None
        for line in header_text.split('\n'):
            if line[:1] in (' ', '\t') and last_key is not None:
                # Folded header: the line continues the previous value.
                headers[last_key] += ' ' + line.strip()
                continue
            # Split on the first colon only; values such as URLs contain more.
            key, separator, value = line.partition(':')
            if not separator:
                continue
            last_key = key.strip().lower()
            headers[last_key] = value.strip()
        return headers

    @staticmethod
    def _decode_body(body, encoding):
        """Decode a part body; return None when it is not transfer-encoded.

        Raises:
            ValueError: when base64 data is malformed.
        """
        raw = body.encode('utf8')
        if encoding == 'base64':
            return base64.decodebytes(raw)
        if encoding == 'quoted-printable':
            return quopri.decodestring(raw)
        return None

    @staticmethod
    def _safe_file_name(location):
        """Reduce a Content-Location to a bare file name, or '' if none.

        The location comes from the archive, so only its last path component
        is kept; this stops a part from being written outside the output
        folder.
        """
        path = location.split('?', 1)[0].split('#', 1)[0]
        name = path.replace('\\', '/').rstrip('/').rsplit('/', 1)[-1]
        return '' if name in ('', '.', '..') else name


if __name__ == '__main__':
    convertor = Convertor()