kdxcxs
4 years ago
commit
fa42b9f3e2
2 changed files with 123 additions and 0 deletions
@ -0,0 +1,7 @@ |
|||
# mht2html |
|||
|
|||
An MHT file convertor |
|||
|
|||
## TODO |
|||
1. Add a fast mode (read the file in bulk instead of line by line with file.readline()) |
|||
2. Support other file formats |
@ -0,0 +1,116 @@ |
|||
import base64 |
|||
import os |
|||
import platform |
|||
import re |
|||
|
|||
|
|||
class Convertor:
    """Interactive converter that unpacks an MHT (MHTML) archive into a folder.

    Instantiating the class runs the whole session: it prompts for the input
    file and the output folder, parses the archive header to find the
    multipart boundary, then walks the parts one by one.

    Only ``text/html`` parts (written to ``index.html``) and base64 encoded
    ``image/*`` parts (written under the last path segment of their
    ``Content-Location``) are extracted; every other part is skipped.
    TODO: support other file formats.
    """

    def __init__(self):
        self.mht_file_path = None   # path typed by the user
        self.output_folder = None   # destination folder, ends with a separator
        self.clear_console = None   # callable that clears the terminal
        self.mht_file_size = None   # size in bytes, used for the progress line
        self.mht_file = None        # open text-mode handle of the archive
        self.boundary = None        # multipart boundary, without leading '--'
        if self.prepare():
            try:
                self.convert()
            finally:
                # Release the handle even when the conversion raises.
                self.mht_file.close()

    def prepare(self):
        """Show the banner and ask for the input file and the output folder.

        Keeps prompting until the archive can be opened. An empty answer for
        the output folder selects the current directory.

        Returns:
            True once ``mht_file``, ``mht_file_size`` and ``output_folder``
            are set.
        """
        clear_command = 'cls' if platform.system() == 'Windows' else 'clear'
        self.clear_console = lambda: os.system(clear_command)

        self.clear_console()
        print('======mht2html======\n'
              'A mht file convertor\n'
              'by kdxcxs(cx@kdxcxs.com)\n')

        while True:
            self.mht_file_path = input('[#] Please entry the path to the mht file to be converted\n===>')
            try:
                self.mht_file = open(self.mht_file_path, 'r', encoding='utf8')
            except FileNotFoundError:
                print('[!] No such file')
                continue
            except OSError as error:
                # e.g. a directory or an unreadable file: ask again
                print(f'[!] Cannot open file: {error}')
                continue
            self.mht_file_size = os.fstat(self.mht_file.fileno()).st_size
            break

        self.output_folder = input('[#] Please entry the path to the output folder\n===>') or '.'
        os.makedirs(self.output_folder, exist_ok=True)
        # Joining with '' appends a trailing separator only when it is missing.
        self.output_folder = os.path.join(self.output_folder, '')
        return True

    def convert(self):
        """Parse the archive header, then extract every part that follows.

        Unsupported or malformed parts are skipped; the loop only ends when
        ``read_mht_file`` reports that no part is left.
        """
        print('[#] Start converting')

        # header
        if not self.analyse_header(self.read_mht_file()):
            return

        # content
        while True:
            part = self.read_mht_file()
            if not part:
                break
            self.analyse_content(part)

    def print_progress(self):
        """Print the share of the archive consumed so far on one console line.

        Does nothing when the file size is unknown or zero, which avoids a
        division by zero on empty files.
        """
        if not self.mht_file_size:
            return
        print('[#] %.3f%% converted' % (self.mht_file.tell() / self.mht_file_size * 100), end='\r')

    def read_mht_file(self):
        """Read the next logical chunk of the archive.

        On the first call (file position 0) the chunk is the archive header:
        every line up to, but excluding, the first blank line.

        On later calls the chunk is one MIME part: lines are skipped up to the
        next ``--boundary`` line, then everything (part headers, blank line,
        body) up to the following delimiter line is returned. That delimiter
        is left unread so the next call starts from it.

        Returns:
            The chunk as a string, or ``''`` when the end of the file or the
            closing ``--boundary--`` delimiter has been reached.
        """
        mht_file = self.mht_file

        if mht_file.tell() == 0:
            header_lines = []
            while True:
                line = mht_file.readline()
                self.print_progress()
                if line in ('', '\n'):  # EOF or end of the header block
                    break
                header_lines.append(line)
            return ''.join(header_lines)

        opening = f'--{self.boundary}'
        closing = f'--{self.boundary}--'

        # Skip everything before the delimiter that opens the next part.
        while True:
            line = mht_file.readline()
            self.print_progress()
            if not line:
                return ''
            delimiter = line.rstrip()
            if delimiter == closing:
                return ''
            if delimiter == opening:
                break

        part_lines = []
        while True:
            position = mht_file.tell()
            line = mht_file.readline()
            if not line:
                break
            if line.rstrip() in (opening, closing):
                # Un-read the delimiter: the body may contain blank lines, so
                # only the next delimiter marks the end of this part.
                mht_file.seek(position)
                break
            part_lines.append(line)
            self.print_progress()
        return ''.join(part_lines)

    def analyse_header(self, header):
        """Extract the multipart boundary from the archive header.

        Accepts both quoted (``boundary="xyz"``) and unquoted
        (``boundary=xyz``) parameters.

        Args:
            header: the archive header text returned by ``read_mht_file``.

        Returns:
            True and sets ``self.boundary`` when a boundary is found,
            otherwise prints a message and returns False.
        """
        match = re.search(r'boundary\s*=\s*(?:"([^"]+)"|([^\s;"]+))', header, re.IGNORECASE)
        if match is None:
            print('[!] Not a mht file')
            return False
        self.boundary = match.group(1) or match.group(2)
        return True

    def analyse_content(self, content):
        """Write one MIME part to the output folder when its type is supported.

        Args:
            content: part headers, a blank line, and the part body.

        Returns:
            True when a file was written, False when the part was skipped
            (unsupported type or encoding, missing file name, bad base64).
        """
        # only support text and base64 image file
        # TODO:support other file formats
        raw_header, separator, body = content.partition('\n\n')
        if not separator:
            return False

        headers = {}
        last_key = None
        for line in raw_header.split('\n'):
            if line[:1] in (' ', '\t') and last_key is not None:
                # folded continuation of the previous header field
                headers[last_key] += ' ' + line.strip()
                continue
            # Split on the first ':' only, so URLs in the value stay intact.
            key, colon, value = line.partition(':')
            if not colon:
                continue
            last_key = key.strip().lower()
            headers[last_key] = value.strip()

        # Drop parameters such as '; charset="utf-8"' before comparing.
        content_type = headers.get('content-type', '').split(';')[0].strip().lower()

        if content_type == 'text/html':
            with open(os.path.join(self.output_folder, 'index.html'), 'w', encoding='utf8') as htmlf:
                htmlf.write(body)
            return True

        if content_type.startswith('image/'):
            # Parts without the header are treated as base64, as before.
            transfer_encoding = headers.get('content-transfer-encoding', 'base64').strip().lower()
            if transfer_encoding != 'base64':
                return False

            # Keep only the last path segment of the location: this drops the
            # URL scheme/host/query and prevents writes outside the folder.
            location = re.split(r'[?#]', headers.get('content-location', ''))[0]
            file_name = re.split(r'[\\/]', location)[-1]
            file_name = re.sub(r'[<>:"|*]', '_', file_name)
            if file_name in ('', '.', '..'):
                return False

            try:
                image_data = base64.decodebytes(body.encode())
            except ValueError:  # binascii.Error is a ValueError subclass
                print(f'[!] Invalid base64 data, skipped {file_name}')
                return False
            with open(os.path.join(self.output_folder, file_name), 'wb') as imgf:
                imgf.write(image_data)
            return True

        return False
|||
|
|||
|
|||
if __name__ == '__main__':
    # Script entry point: constructing the converter runs the whole
    # interactive session (prompts, conversion, cleanup).
    convertor = Convertor()
Loading…
Reference in new issue