def _sanitize_filename(name):
    """Return *name* reduced to characters that are safe in a file name.

    Everything except word characters, whitespace and hyphens is removed,
    surrounding whitespace is trimmed, and spaces become underscores.
    """
    return re.sub(r'[^\w\s-]', '', name).strip().replace(' ', '_')


def _locate_headings(toc_titles, main_content):
    """Find where each TOC title appears as a standalone line in the body.

    Returns a list of ``(toc_index, title, heading_start, body_start)``
    tuples for the titles that were found; a warning is printed for each
    title that is missing.
    """
    located = []
    cursor = 0
    for index, title in enumerate(toc_titles):
        # The title must fill a whole line (surrounding whitespace allowed);
        # re.escape keeps punctuation in the title from acting as regex syntax.
        pattern = re.compile(
            rf'^\s*{re.escape(title)}\s*$', re.IGNORECASE | re.MULTILINE
        )
        # Look after the previous heading first so that a repeated title
        # resolves to its next occurrence; fall back to the whole document
        # so a TOC listed out of order is still honoured.
        match = pattern.search(main_content, cursor) or pattern.search(main_content)
        if not match:
            print(f"Warning: Title '{title}' not found in the main content. Skipping.")
            continue
        located.append((index, title, match.start(), match.end()))
        cursor = max(cursor, match.end())
    return located


def split_txt_by_toc(file_path):
    """Split a TXT file into one file per table-of-contents (TOC) entry.

    The TOC is the text between the first two asterisk markers (a run of at
    least five ``*``), one title per non-empty line. Everything after the
    second marker is the main content. Each title is located in the main
    content as a standalone line (case-insensitive), and the text up to the
    next located heading is written to ``NN_<sanitized title>.txt`` inside
    a ``<input base name>_split`` directory created in the current working
    directory. ``NN`` is the title's 1-based position in the TOC.

    Problems are reported with ``print`` rather than raised: a missing or
    unreadable input file or a missing marker aborts the run, a title that
    cannot be found is skipped, and a failed write skips only that file.

    Args:
        file_path (str): The path to the input TXT file.

    Returns:
        None.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
        return
    except Exception as e:
        # Deliberately broad: any read/decoding failure is reported, not raised.
        print(f"An error occurred while reading the file: {e}")
        return

    # A marker is a whole run of asterisks, so a line such as '**********'
    # counts once instead of overlapping with itself as start and end.
    marker = re.compile(r'\*{5,}')
    toc_open = marker.search(content)
    if toc_open is None:
        print("Error: Table of contents start marker '*****' not found.")
        return
    toc_close = marker.search(content, toc_open.end())
    if toc_close is None:
        print("Error: Table of contents end marker '*****' not found.")
        return

    # TOC titles, in order, one per non-empty line between the markers.
    toc_block = content[toc_open.end():toc_close.start()].strip()
    toc_titles = [line.strip() for line in toc_block.split('\n') if line.strip()]

    # The output directory is named after the input file and is relative to
    # the current working directory.
    output_dir = os.path.splitext(os.path.basename(file_path))[0] + "_split"
    os.makedirs(output_dir, exist_ok=True)
    print(f"Creating output directory: {output_dir}")

    main_content = content[toc_close.end():].strip()

    headings = _locate_headings(toc_titles, main_content)
    heading_starts = sorted(start for _, _, start, _ in headings)

    for index, title, _, body_start in headings:
        # A section ends at the nearest heading that was actually found, so a
        # TOC entry missing from the body cannot make the preceding section
        # swallow the rest of the document.
        body_end = next(
            (start for start in heading_starts if start >= body_start),
            len(main_content),
        )
        section_content = main_content[body_start:body_end].strip()

        file_name = f"{index + 1:02d}_{_sanitize_filename(title)}.txt"
        out_path = os.path.join(output_dir, file_name)

        try:
            with open(out_path, 'w', encoding='utf-8') as f:
                # Put the title back at the top of each file.
                f.write(f"{title}\n\n{section_content}\n")
            print(f"Created file: {out_path}")
        except Exception as e:
            # Best effort: report the failure and continue with the next section.
            print(f"An error occurred while writing file '{file_name}': {e}")


# Example usage:
# Assuming your file is named 'document.txt'
if __name__ == "__main__":
    split_txt_by_toc('document.txt')