kdxcxs
4 years ago
commit
fa42b9f3e2
2 changed files with 123 additions and 0 deletions
@ -0,0 +1,7 @@ |
|||||
|
# mht2html |
||||
|
|
||||
|
A mht file convertor |
||||
|
|
||||
|
## TODO |
||||
|
1. Add a fast mode (use file.read() instead of file.readline())
||||
|
2. Support other file formats
@ -0,0 +1,116 @@ |
|||||
|
import base64
import os
import platform
import posixpath
import quopri
import re
from email.parser import HeaderParser
from urllib.parse import unquote, urlsplit
||||
|
|
||||
|
|
||||
|
class Convertor:
    """Interactive converter that unpacks an MHT (MIME HTML) archive.

    Instantiating the class runs the whole workflow: it asks for the input
    file and the output folder, then writes the ``text/html`` part to
    ``index.html`` and every ``image/*`` part to its own file inside the
    output folder. Parts of any other type are skipped.
    """

    def __init__(self):
        """Prompt for the paths, run the conversion and close the input file."""
        self.mht_file_path = None
        self.output_folder = None
        self.clear_console = None
        self.mht_file_size = None
        self.mht_file = None
        self.boundary = None
        # True when the previous read stopped on a part delimiter, so the
        # next read starts directly at the following part's headers.
        self._at_delimiter = False
        try:
            if self.prepare():
                self.convert()
        finally:
            # Release the handle even when the conversion raises.
            if self.mht_file is not None:
                self.mht_file.close()

    def prepare(self):
        """Show the banner and ask for the input file and output folder.

        Opens the MHT file (re-prompting until it can be opened), records its
        size for the progress display and creates the output folder if needed.
        ``self.output_folder`` always ends with a path separator afterwards.

        Returns:
            True once both paths are ready.
        """
        if platform.system() == 'Windows':
            self.clear_console = lambda: os.system('cls')
        else:
            self.clear_console = lambda: os.system('clear')

        self.clear_console()
        print('======mht2html======\n'
              'A mht file convertor\n'
              'by kdxcxs(cx@kdxcxs.com)\n')

        while True:
            self.mht_file_path = input('[#] Please entry the path to the mht file to be converted\n===>')
            try:
                self.mht_file = open(self.mht_file_path, 'r', encoding='utf8')
            except FileNotFoundError:
                print('[!] No such file')
                continue
            except OSError as error:
                # A directory, a permission problem, an invalid name, ...
                print(f'[!] Cannot open file: {error}')
                continue
            self.mht_file_size = os.fstat(self.mht_file.fileno()).st_size
            break

        folder = input('[#] Please entry the path to the output folder\n===>')
        if not folder:
            # An empty answer means "here"; os.makedirs('') would raise.
            folder = os.curdir
        os.makedirs(folder, exist_ok=True)
        # Joining with '' appends the separator only when it is missing.
        self.output_folder = os.path.join(folder, '')
        return True

    def convert(self):
        """Parse the archive header, then extract parts until the file ends."""
        print('[#] Start converting')

        # header
        if not self.analyse_header(self.read_mht_file()):
            return

        # content
        while True:
            part = self.read_mht_file()
            if not self.analyse_content(part):
                break
        # Finish the '\r'-terminated progress line.
        print()

    def print_progress(self):
        """Print how much of the input file has been consumed, in percent."""
        if not self.mht_file_size:
            # Unknown or zero size: nothing meaningful to report.
            return
        percent = self.mht_file.tell() / self.mht_file_size * 100
        print('[#] %.3f%% converted' % percent, end='\r')

    def read_mht_file(self):
        """Read the next section of the MHT file.

        The first call (file position 0) returns the archive's top-level
        header: every line before the first blank line. Each later call
        returns one MIME part (its headers, a blank line and the complete
        body) located between two boundary delimiter lines.

        Returns:
            The section text, or ``''`` when no further part exists.
        """
        if self.mht_file.tell() == 0:
            self._at_delimiter = False
            header_lines = []
            while True:
                line = self.mht_file.readline()
                self.print_progress()
                # '' is end of file, a bare newline ends the header.
                if line == '' or line in ('\n', '\r\n'):
                    break
                header_lines.append(line)
            return ''.join(header_lines)

        delimiter = f'--{self.boundary}'
        closing = f'{delimiter}--'

        if not getattr(self, '_at_delimiter', False):
            # Skip the preamble until the next delimiter line.
            while True:
                line = self.mht_file.readline()
                self.print_progress()
                if line == '':
                    return ''
                marker = line.rstrip()
                if marker == closing:
                    return ''
                if marker == delimiter:
                    break
        self._at_delimiter = False

        part_lines = []
        while True:
            line = self.mht_file.readline()
            self.print_progress()
            if line == '':
                break
            marker = line.rstrip()
            if marker == delimiter:
                # Remember that the next part starts right here.
                self._at_delimiter = True
                break
            if marker == closing:
                break
            part_lines.append(line)
        return ''.join(part_lines)

    def analyse_header(self, header):
        """Extract the multipart boundary from the archive's top-level header.

        Accepts both the quoted (``boundary="..."``) and the unquoted
        (``boundary=...``) form and stores the value in ``self.boundary``.

        Args:
            header: Text of the top-level header.

        Returns:
            True when a boundary was found, False otherwise.
        """
        found = re.search(r'boundary\s*=\s*(?:"([^"]*)"|([^\s;"]+))',
                          header, re.IGNORECASE)
        boundary = None
        if found is not None:
            boundary = found.group(1) if found.group(1) is not None else found.group(2)
        if not boundary:
            print('[!] Not a mht file')
            return False
        self.boundary = boundary
        return True

    def analyse_content(self, content):
        """Write one MIME part to the output folder.

        ``text/html`` parts go to ``index.html``; ``image/*`` parts go to a
        file named after the last path segment of their ``Content-Location``.
        Bodies are decoded according to ``Content-Transfer-Encoding``
        (base64 or quoted-printable). Parts of other types are skipped.

        Args:
            content: Part text as returned by ``read_mht_file``.

        Returns:
            True when the part was consumed (written or skipped), False when
            ``content`` is empty, i.e. the archive has no more parts.
        """
        # only support text and base64 image file
        # TODO:support other file formats
        if not content:
            return False

        if content.startswith('\n'):
            # A part without any header lines.
            header_text, body = '', content[1:]
        else:
            header_text, _, body = content.partition('\n\n')

        headers = HeaderParser().parsestr(header_text)
        content_type = headers.get_content_type()
        main_type, _, sub_type = content_type.partition('/')
        encoding = str(headers.get('Content-Transfer-Encoding', '')).strip().lower()

        if content_type != 'text/html' and main_type != 'image':
            return True

        if main_type == 'image' and not encoding:
            # Images without a declared encoding are treated as base64.
            encoding = 'base64'
        try:
            payload = self._decode_body(body, encoding)
        except ValueError:
            print(f'[!] Skipped a {content_type} part with an invalid {encoding} body')
            return True

        if content_type == 'text/html':
            target = f'{self.output_folder}index.html'
            if payload is None:
                with open(target, 'w', encoding='utf8') as htmlf:
                    htmlf.write(body)
            else:
                # Decoded bytes keep the charset the document declares.
                with open(target, 'wb') as htmlf:
                    htmlf.write(payload)
            return True

        if payload is None:
            payload = body.encode('utf8')
        name = self._output_name(headers.get('Content-Location'), sub_type)
        with open(f'{self.output_folder}{name}', 'wb') as imgf:
            imgf.write(payload)
        return True

    @staticmethod
    def _decode_body(body, transfer_encoding):
        """Decode a part body; return None when it is stored as plain text."""
        if transfer_encoding == 'base64':
            return base64.decodebytes(body.encode())
        if transfer_encoding == 'quoted-printable':
            return quopri.decodestring(body.encode())
        return None

    def _output_name(self, location, sub_type):
        """Derive a safe file name from a Content-Location header value."""
        try:
            url_path = urlsplit(str(location).strip()).path if location else ''
        except ValueError:
            url_path = ''
        # Only the last path segment is used, so archive content cannot
        # direct a write outside the output folder.
        name = posixpath.basename(unquote(url_path).replace('\\', '/'))
        if name in ('', '.', '..'):
            self._unnamed_images = getattr(self, '_unnamed_images', 0) + 1
            name = f'image_{self._unnamed_images}.{sub_type}'
        # Replace characters that are separators or invalid in file names.
        return re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', name)
||||
|
|
||||
|
|
||||
|
if __name__ == '__main__':
    # Constructing a Convertor runs the whole interactive conversion.
    convertor = Convertor()
Loading…
Reference in new issue