import glob
import os
import re
from datetime import datetime, timedelta, timezone
# Japan Standard Time: fixed UTC+09:00 offset, no daylight saving.
JST = timezone(timedelta(hours=9))

# Matches the "last updated" label followed by a <time> element and captures:
#   1: label and opening tag up to the datetime value   2: ISO timestamp
#   3: remainder of the opening tag                     4: visible text
#   5: closing tag
# NOTE(review): the markup shape (label immediately followed by
# <time datetime="...">text</time>) is an assumption -- confirm against the
# HTML the site generator actually emits.
_LASTUPDATED_RE = re.compile(
    r'(最終更新日:\s*<time\b[^>]*?\bdatetime=")([^"]+)("[^>]*>)([^<]*)(</time>)'
)


def fix_lastupdated_date(content):
    """Rewrite "last updated" timestamps in HTML text from UTC to JST.

    For every ``最終更新日: <time datetime="ISO">TEXT</time>`` occurrence, the
    ``datetime`` attribute is replaced by the same instant expressed with a
    ``+09:00`` offset, and the visible text is replaced by that instant
    formatted as ``YYYY/MM/DD HH:MM`` in JST.

    Args:
        content: HTML document text.

    Returns:
        The text with every matching timestamp converted. Text without a
        matching element is returned unchanged, as is any element whose
        ``datetime`` value is not a parseable ISO 8601 string.
    """

    def _to_jst(match):
        prefix, iso_str, middle, _old_text, suffix = match.groups()
        # datetime.fromisoformat() only accepts a trailing "Z" from
        # Python 3.11 on, so spell the UTC offset out explicitly.
        if iso_str.endswith("Z"):
            iso_str = iso_str[:-1] + "+00:00"
        try:
            parsed = datetime.fromisoformat(iso_str)
        except ValueError:
            # Not a timestamp we understand: leave the element untouched.
            return match.group(0)
        if parsed.tzinfo is None:
            # Treat offset-less values as UTC rather than the machine's
            # local zone, so the result does not depend on where this runs.
            parsed = parsed.replace(tzinfo=timezone.utc)
        jst_dt = parsed.astimezone(JST)
        new_iso = jst_dt.isoformat(timespec="milliseconds")
        new_display = jst_dt.strftime("%Y/%m/%d %H:%M")
        return f"{prefix}{new_iso}{middle}{new_display}{suffix}"

    return _LASTUPDATED_RE.sub(_to_jst, content)
def remove_null_characters_from_html(directory, ignore_files=(), ignore_directories=()):
    """Strip NUL characters from, and fix dates in, HTML files under a directory.

    Every ``*.html`` file found recursively below ``directory`` is read as
    UTF-8, has all ``"\\0"`` characters removed, is passed through
    ``fix_lastupdated_date`` and is then written back in place.

    Args:
        directory: Root directory to search.
        ignore_files: File base names (e.g. ``"404.html"``) to leave untouched.
        ignore_directories: Directories to leave untouched. An entry matches
            when it equals the name of any directory between ``directory``
            and the file (so whole subtrees are skipped), or when it equals
            the file's directory path exactly as returned by ``glob``.
    """
    skip_names = set(ignore_files or ())
    skip_dirs = set(ignore_directories or ())

    pattern = os.path.join(directory, "**/*.html")
    for file_path in glob.glob(pattern, recursive=True):
        if os.path.basename(file_path) in skip_names:
            continue

        parent = os.path.dirname(file_path)
        # Compare by path component relative to the root: the full dirname
        # (e.g. "./dist/assets") never equals a bare name such as "assets".
        rel_parts = os.path.relpath(parent, directory).split(os.sep)
        if parent in skip_dirs or not skip_dirs.isdisjoint(rel_parts):
            continue

        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()

        content = fix_lastupdated_date(content.replace("\0", ""))

        with open(file_path, "w", encoding="utf-8") as file:
            file.write(content)
if __name__ == "__main__":
    # Script entry point: process the HTML files under ./dist, passing
    # 404.html and the assets/img directories as exclusions.
    remove_null_characters_from_html(
        "./dist",
        ignore_files=["404.html"],
        ignore_directories=["assets", "img"],
    )