Initial Commit

2025-09-22 20:19:35 +02:00
commit 3bde24308a
3 changed files with 197 additions and 0 deletions
@@ -0,0 +1,35 @@
+# Toniebox Chapter Extractor
+
+This script extracts chapter information from a Toniebox file, converts it to audio chapters, and splits the audio into separate files.
+
+## Prerequisites
+
+Ensure you have Python 3 installed on your system.
+
+## Installation
+
+1. Clone this repository or download the script files.
+
+2. Navigate to the project directory.
+
+3. If you haven't already install ffmpeg using your favorite package manager
+4. Install the required dependencies using `pip`:
+    ```bash
+    pip install -r requirements.txt
+    ```
+## Usage
+
+To run the script, use the following command:
+
+```bash
+python Taf2Ogg.py <tonie_file> <output_directory>
+    <tonie_file>: The path to the Toniebox file you want to process.
+    <output_directory>: The directory where the extracted chapters will be saved.
+
+python Taf2Ogg.py CONTENT/8D77321C/500304E0 ./output_chapters
+```
+
+## Notes
+
+    The script assumes the sample rate for the Toniebox Opus audio is 48000 Hz.
+    If the Toniebox has not cached the entire file, extraction of all chapters may not be possible.
@@ -0,0 +1,160 @@
+#!/bin/python3
+
+import sys
+import struct
+import re 
+import ffmpeg
+import os 
+from google.protobuf import descriptor_pb2, descriptor_pool, message_factory
+
+# Protobuf definition in descriptor form
+file_descriptor_proto = descriptor_pb2.FileDescriptorProto()
+file_descriptor_proto.name = 'tonie_header.proto'
+file_descriptor_proto.package = 'tonie'
+file_descriptor_proto.syntax = 'proto3'
+
+# TonieHeader message type
+message_descriptor_proto = file_descriptor_proto.message_type.add()
+message_descriptor_proto.name = 'TonieHeader'
+
+# Fields of TonieHeader
+field_data_hash = message_descriptor_proto.field.add()
+field_data_hash.name = 'dataHash'
+field_data_hash.number = 1
+field_data_hash.label = descriptor_pb2.FieldDescriptorProto.LABEL_OPTIONAL
+field_data_hash.type = descriptor_pb2.FieldDescriptorProto.TYPE_BYTES
+
+field_data_length = message_descriptor_proto.field.add()
+field_data_length.name = 'dataLength'
+field_data_length.number = 2
+field_data_length.label = descriptor_pb2.FieldDescriptorProto.LABEL_OPTIONAL
+field_data_length.type = descriptor_pb2.FieldDescriptorProto.TYPE_UINT32
+
+field_timestamp = message_descriptor_proto.field.add()
+field_timestamp.name = 'timestamp'
+field_timestamp.number = 3
+field_timestamp.label = descriptor_pb2.FieldDescriptorProto.LABEL_OPTIONAL
+field_timestamp.type = descriptor_pb2.FieldDescriptorProto.TYPE_UINT32
+
+field_chapter_pages = message_descriptor_proto.field.add()
+field_chapter_pages.name = 'chapterPages'
+field_chapter_pages.number = 4
+field_chapter_pages.label = descriptor_pb2.FieldDescriptorProto.LABEL_REPEATED
+field_chapter_pages.type = descriptor_pb2.FieldDescriptorProto.TYPE_UINT32
+field_chapter_pages.options.packed = True
+
+field_padding = message_descriptor_proto.field.add()
+field_padding.name = 'padding'
+field_padding.number = 5
+field_padding.label = descriptor_pb2.FieldDescriptorProto.LABEL_OPTIONAL
+field_padding.type = descriptor_pb2.FieldDescriptorProto.TYPE_BYTES
+
+# Create a pool and add the FileDescriptorProto
+pool = descriptor_pool.DescriptorPool()
+file_descriptor = pool.Add(file_descriptor_proto)
+
+# Get the message descriptor and create a message class
+message_descriptor = pool.FindMessageTypeByName('tonie.TonieHeader')
+TonieHeader = message_factory.GetMessageClass(message_descriptor)
+
+def ExtractChapters(filename):
+    with open(filename, 'rb') as f:
+        # Read the first 4096 bytes which includes the header
+        header_data = f.read(0x1000)
+
+        # Decode the header
+        header = TonieHeader()
+        header.ParseFromString(header_data[4:])
+
+        return list(header.chapterPages)
+
+
+def read_and_save_binary_file_bytes(input_filename, output_filename, ChapterList):
+    try:
+        with open(input_filename, 'rb') as f:
+            # Save the bytes to the output file
+            try:
+                with open(output_filename, 'wb') as output_file:
+                    i = 0
+                    TimeList = list((""))
+                    f.seek(0)
+                    counter = 0 
+                    print("Scanning file for Chapters...")
+                    while True:
+                        bytes_data = f.read(4096)
+                        if i > 0:
+                            output_file.write(bytes_data)
+                        i +=1
+                        if not bytes_data:
+                            break
+                        index = bytes_data.find(b"OggS")
+
+                        if index != -1:
+                            buf = bytes_data[index+6:index+22]
+                            granule_pos = int.from_bytes(buf[:7], byteorder='little')
+                            time_seconds = granule_pos / 48000 # 48000 = sample rate for opus
+                            seq_num = int.from_bytes(buf[12:18], byteorder='little')
+                            if seq_num == ChapterList[counter]:
+                                print(f"Found {counter} of {len(ChapterList)-1}",end="\r")
+                                TimeList.append(f"{int(time_seconds // 3600):02}:{int((time_seconds % 3600) // 60):02}:{(time_seconds % 60):02.03f}")
+                                counter +=1
+                                if counter >= len(ChapterList):
+                                    print(f"Found {counter-1} of {len(ChapterList)-1}")
+                                    break
+                    if counter < len(ChapterList):
+                        print("Your Toniebox has not Cached the entire file. \nExtraction of all Chapters may not be possible.")
+                    print(f"Scanning Complete.")
+                    return TimeList
+            except FileNotFoundError:
+                print(f"File {output_filename} not found.")
+            except Exception as e:
+                print(f"An error occurred: {e}")
+    
+    except FileNotFoundError:
+        print(f"File {input_filename} not found.")
+    except Exception as e:
+        print(f"An error occurred: {e}")
+
+def split_audio(input_file, chapter_list, output_dir):
+    print("Extracting files...")
+    if not os.path.exists(output_dir):
+        try:
+            os.makedirs(output_dir)
+        except Exception as e:
+            print(f"Failed to create output directory: {e}")
+            return
+
+    length = len(chapter_list)
+    for i in range(1, length):
+        print(f"Exporting file {i} of {length-1}",end="\r")
+        start_time = chapter_list[i-1]
+        end_time = chapter_list[i]
+        output_file = os.path.join(output_dir, f"Chapter_{i}.ogg")
+        try:
+            (
+            ffmpeg
+            .input(input_file, ss=start_time, to=end_time, v=8)
+            .output(output_file, codec='copy')
+            .run()
+            )
+        except ffmpeg.Error as e:
+            print(f"Error exporting file {output_file}: {e}")
+    print("\ndone")
+        
+def main(filename, output_dir):
+    try:
+        output_filename = os.path.join('/tmp', os.path.basename(filename) + ".ogg")
+        TimeList = read_and_save_binary_file_bytes(filename, output_filename, ExtractChapters(filename))
+        split_audio(output_filename, TimeList, output_dir)
+        os.remove(output_filename)
+    except Exception as e:
+        print(f"Error: {e}")
+
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        print("Usage: python script.py <tonie_file> <output_directory>")
+        sys.exit(1)
+    
+    filename = sys.argv[1]
+    output_dir = sys.argv[2]
+    main(filename, output_dir)
@@ -0,0 +1,2 @@
+ffmpeg-python
+protobuf