Dividing Large Files into Smaller Ones Using Tiktoken

The script below reads a text file, counts the tokens in each line with tiktoken, and groups lines into chunks that stay under a configurable token budget, writing each chunk to its own file. Tune `max_tokens_per_chunk` to the context window of your target model; the default of 1,500,000 is generous, so lower it to match the model you plan to feed the chunks into.
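Before the full script, here is a quick look at the two tiktoken calls it relies on (a minimal sketch; the sample string is just illustrative):

```python
import tiktoken

# Look up the byte-pair encoding used by a specific model
encoding = tiktoken.encoding_for_model("gpt-4")

tokens = encoding.encode("Hello, world!")
print(tokens)       # list of integer token IDs
print(len(tokens))  # token count, which the chunker uses for sizing
```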

```python
import tiktoken

def create_chunks(file_path, model_name="gpt-4", max_tokens_per_chunk=1500000):
    # Get the tokenizer encoding for the specified model
    encoding = tiktoken.encoding_for_model(model_name)

    # Read the file
    with open(file_path, 'r') as file:
        text = file.read()

    # Divide the text into chunks based on token counts, splitting on line boundaries
    chunks = []
    current_chunk = ""
    current_token_count = 0

    for line in text.split('\n'):
        line_tokens = encoding.encode(line)
        line_token_count = len(line_tokens)

        # Start a new chunk once adding this line would exceed the budget
        if current_token_count + line_token_count > max_tokens_per_chunk:
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = line + '\n'
            current_token_count = line_token_count
        else:
            current_chunk += line + '\n'
            current_token_count += line_token_count

    # Don't lose the final partial chunk
    if current_chunk:
        chunks.append(current_chunk)

    # Save each chunk to a separate text file
    for i, chunk in enumerate(chunks):
        chunk_file_path = f'chunk{i+1}.txt'
        with open(chunk_file_path, 'w') as chunk_file:
            chunk_file.write(chunk)
```

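To sanity-check the result, you can re-tokenize each chunk file and confirm it stays under the budget (a minimal sketch, assuming the `chunkN.txt` files from a run of `create_chunks` are in the current working directory):

```python
import glob
import tiktoken

encoding = tiktoken.encoding_for_model("gpt-4")

# Re-count tokens in every chunk file produced by create_chunks
# (lexicographic order, which is fine for a spot check)
for path in sorted(glob.glob('chunk*.txt')):
    with open(path, 'r') as f:
        n_tokens = len(encoding.encode(f.read()))
    print(f"{path}: {n_tokens} tokens")
```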




Example usage:

```python
create_chunks('songs_original.txt', model_name="gpt-4")
```
