DEV Community: Sachin Varghese

Python compare script 7

Sachin Varghese — Thu, 19 Jun 2025 12:42:03 +0000

import os
import difflib
import re

def extract_method_names(file_path):
    """Extract method names from a Java file.

    The file is scanned with a regular expression, not a Java parser, so the
    result is heuristic. Constructors that carry a modifier are reported as
    well, and overloads appear once per declaration.

    Args:
        file_path: Path of the Java source file to scan.

    Returns:
        list: Names found, in order of appearance.
    """
    # Reserved words that can never be a method name; the pattern would
    # otherwise report e.g. "if" for "else if (x) {".
    reserved_names = {
        'if', 'else', 'for', 'while', 'do', 'switch', 'case', 'catch',
        'try', 'finally', 'return', 'throw', 'new', 'synchronized',
        'super', 'this', 'assert', 'instanceof',
    }
    # Reserved words that can never be a return type; the pattern would
    # otherwise report "helper" for "return helper(a) + 1".
    reserved_non_types = {
        'return', 'new', 'throw', 'else', 'case', 'assert', 'do',
        'instanceof', 'goto', 'package', 'import',
    }
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        content = file.read()
    # Java method pattern: return_type methodName(parameters)
    # Same pattern as before, with the word in the return-type position
    # captured too so that it can be checked against the reserved words.
    pattern = r'(?:public|protected|private|static|\s)+([\w\<\>\[\]]+)\s+([\w]+)\s*\([\w\s,\[\]<>\.\]]*\)\s*(?:\{|[^;])'
    return [
        name
        for leading_word, name in re.findall(pattern, content)
        if name not in reserved_names and leading_word not in reserved_non_types
    ]

def get_package_methods(repo_path):
    """Collect method names from every ``.java`` file under *repo_path*.

    Returns:
        dict: Maps a dotted package name (the directory path relative to
        *repo_path*, with path separators replaced by ".") to the list of
        method names found in all Java files of that directory.
    """
    methods_by_package = {}
    for current_dir, _, filenames in os.walk(repo_path):
        for filename in filenames:
            if not filename.endswith('.java'):
                continue
            found = extract_method_names(os.path.join(current_dir, filename))
            relative_dir = os.path.relpath(current_dir, repo_path)
            package_name = relative_dir.replace(os.sep, '.')
            # Several files can live in one directory; pool their methods.
            methods_by_package.setdefault(package_name, []).extend(found)
    return methods_by_package

def compare_methods(repo1_methods, repo2_methods):
    """Report, per package, the method names present in only one repository.

    Args:
        repo1_methods: Mapping of package name to method-name list (repo 1).
        repo2_methods: Mapping of package name to method-name list (repo 2).

    Returns:
        dict: Only packages that differ are included. Each value holds the
        sorted lists ``'only_in_repo1'`` and ``'only_in_repo2'``.
    """
    differences = {}
    for package in set(repo1_methods) | set(repo2_methods):
        left = set(repo1_methods.get(package, []))
        right = set(repo2_methods.get(package, []))
        if left == right:
            # Same set of names on both sides: nothing to report.
            continue
        differences[package] = {
            'only_in_repo1': sorted(left - right),
            'only_in_repo2': sorted(right - left),
        }
    return differences

def main(repo1_path, repo2_path, output_file='method_comparison_results.txt'):
    """Compare the Java methods of two repositories and write a text report.

    Args:
        repo1_path: Root directory of the first repository.
        repo2_path: Root directory of the second repository.
        output_file: Path of the report file; it is overwritten.

    Prints an error and returns without writing anything when either
    repository path is empty.
    """
    if not repo1_path or not repo2_path:
        print("Error: Repository paths must be specified")
        return

    repo1_methods = get_package_methods(repo1_path)
    repo2_methods = get_package_methods(repo2_path)
    comparison_results = compare_methods(repo1_methods, repo2_methods)

    # Write results to a text file
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("Method Comparison Results\n")
        f.write(f"Repo 1: {repo1_path}\n")
        f.write(f"Repo 2: {repo2_path}\n")
        f.write("=" * 80 + "\n\n")

        for package, result in comparison_results.items():
            # The heading used to read "InComparing ...": a garbled "\n".
            f.write(f"\nComparing methods in package: {package}\n")
            f.write("-" * 50 + "\n")
            if result['only_in_repo1']:
                f.write("Methods only in Repo1:\n")
                for method in result['only_in_repo1']:
                    f.write(f"  {method}\n")
                f.write("\n")
            if result['only_in_repo2']:
                f.write("Methods only in Repo2:\n")
                for method in result['only_in_repo2']:
                    f.write(f"  {method}\n")
                f.write("\n")

    print(f"Comparison results saved to {output_file}")

if __name__ == "__main__":
    # Script entry point: runs only when the file is executed directly,
    # not when it is imported.
    # Replace these with the actual paths to your Java repositories
    repo1_path = '/path/to/your/first/java/repo'
    repo2_path = '/path/to/your/second/java/repo'

    # Optional: Specify the output file path
    output_file = 'java_method_comparison.txt'

    # Compare both trees and write the report to output_file.
    main(repo1_path, repo2_path, output_file)

Python compare script 6

Sachin Varghese — Thu, 19 Jun 2025 12:26:48 +0000

import os
import difflib
import re

def extract_method_names(file_path):
    """Extract method names from a Java file.

    The file is scanned with a regular expression, not a Java parser, so the
    result is heuristic.

    Args:
        file_path: Path of the Java source file to scan.

    Returns:
        list: Names found, in order of appearance.
    """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        content = file.read()
    # Java method pattern: return_type methodName(parameters)
    # The last alternative must be the whitespace class "\s". It used to be a
    # literal "s", which let any letter "s" start a match and missed
    # declarations that carry no access modifier.
    pattern = r'(?:(?:public|protected|private|static|\s)[\w\s]*\s+[\w<>\[\]]+\s+([\w]+)\s*\()'
    method_names = re.findall(pattern, content)
    return method_names

def get_package_methods(repo_path):
    """Walk *repo_path* and gather the method names of every Java file.

    Returns:
        dict: Keyed by the directory path relative to *repo_path* with path
        separators turned into dots; each value is the combined list of
        method names from the ``.java`` files in that directory.
    """
    package_methods = {}
    for root, _dirs, files in os.walk(repo_path):
        for java_file in (entry for entry in files if entry.endswith('.java')):
            names = extract_method_names(os.path.join(root, java_file))
            key = os.path.relpath(root, repo_path).replace(os.sep, '.')
            try:
                package_methods[key].extend(names)
            except KeyError:
                # First Java file seen for this directory.
                package_methods[key] = names
    return package_methods

def compare_methods(repo1_methods, repo2_methods):
    """Find, for each package, the method names that exist on one side only.

    Packages whose method-name sets are equal are left out of the result.
    Every reported package maps to a dict holding two sorted lists,
    ``'only_in_repo1'`` and ``'only_in_repo2'``.
    """
    def unique_to(mine, theirs):
        # Names in `mine` that `theirs` lacks, in sorted order.
        return sorted(mine.difference(theirs))

    report = {}
    for package in set(repo1_methods.keys()) | set(repo2_methods.keys()):
        first = set(repo1_methods.get(package, []))
        second = set(repo2_methods.get(package, []))
        missing_from_second = unique_to(first, second)
        missing_from_first = unique_to(second, first)
        if missing_from_second or missing_from_first:
            report[package] = {
                'only_in_repo1': missing_from_second,
                'only_in_repo2': missing_from_first,
            }
    return report

def main(repo1_path, repo2_path):
    """Print the method-name differences between two repositories.

    Names found only in repo 1 are listed with a "+" prefix, names found
    only in repo 2 with a "-" prefix. When either path is empty, an error
    is printed and nothing else happens.
    """
    if not (repo1_path and repo2_path):
        print("Error: Repository paths must be specified")
        return

    differences = compare_methods(
        get_package_methods(repo1_path),
        get_package_methods(repo2_path),
    )
    # (result key, heading, line prefix) for each side of the comparison.
    sections = (
        ('only_in_repo1', "Methods only in Repo1:", '+'),
        ('only_in_repo2', "Methods only in Repo2:", '-'),
    )
    for package, result in differences.items():
        print(f"\nComparing methods in package: {package}")
        for key, heading, marker in sections:
            if not result[key]:
                continue
            print(heading)
            for method in result[key]:
                print(f"{marker} {method}")

if __name__ == "__main__":
    # Script entry point: runs only when the file is executed directly.
    # Replace these paths with the actual paths to your repositories
    # While either path is left empty, main() prints an error and returns.
    repo1_path = ''
    repo2_path = ''
    main(repo1_path, repo2_path)

Py compare script 5

Sachin Varghese — Thu, 19 Jun 2025 08:51:32 +0000

import os
import re
import difflib

def extract_java_method_names(file_path):
    """Extract method names from a Java file.

    Each stripped line is matched on its own against a declaration pattern
    of the form ``[modifiers] return_type name(parameters)``. This is a
    heuristic, not a parser: declarations split over several lines and
    return types that contain spaces (e.g. ``Map<String, Integer>``) are not
    recognised. Constructors that carry a modifier are reported too.

    Args:
        file_path: Path of the Java source file to scan.

    Returns:
        list: Method names in order of appearance.
    """
    # Words that can sit where the pattern expects a return type or a method
    # name but are really statements ("return foo(x)", "else if (x) {").
    not_a_type = {'return', 'new', 'throw', 'else', 'case', 'assert', 'do'}
    not_a_name = {
        'if', 'else', 'for', 'while', 'do', 'switch', 'catch', 'try',
        'return', 'throw', 'new', 'synchronized', 'super', 'this',
    }
    method_names = []
    # group(1) is the return type, group(2) the method name. The previous
    # pattern had a single group and could not match a declaration at all,
    # so match.group(2) was never valid.
    method_pattern = re.compile(
        r'^(?:(?:public|protected|private|static|final|abstract|synchronized|native)\s+)*'
        r'([\w<>\[\].?]+)\s+(\w+)\s*\([^)]*\)'
    )
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        for line in file:
            match = method_pattern.match(line.strip())
            if not match:
                continue
            return_type, method_name = match.group(1), match.group(2)
            if return_type in not_a_type or method_name in not_a_name:
                continue
            method_names.append(method_name)
    return method_names

def get_all_method_names(repo_path):
    """Gather method names from all ``.java`` files below *repo_path*.

    Returns:
        dict: Dotted relative directory path -> list of method names found
        in the Java files of that directory.
    """
    collected = {}
    for folder, _, entries in os.walk(repo_path):
        for entry in entries:
            if not entry.endswith('.java'):
                continue
            names = extract_java_method_names(os.path.join(folder, entry))
            package_name = os.path.relpath(folder, repo_path).replace(os.sep, '.')
            collected.setdefault(package_name, []).extend(names)
    return collected

def compare_methods(repo1_methods, repo2_methods):
    """Compare per-package method names of two repositories.

    Returns a dict containing only the packages whose name sets differ;
    each entry lists, sorted, the names found only in repo 1 and only in
    repo 2.
    """
    outcome = {}
    for package in set(repo1_methods.keys()).union(repo2_methods.keys()):
        in_first = set(repo1_methods.get(package, []))
        in_second = set(repo2_methods.get(package, []))
        # A non-empty symmetric difference means at least one side has a
        # name the other lacks.
        if in_first ^ in_second:
            outcome[package] = {
                'only_in_repo1': sorted(in_first - in_second),
                'only_in_repo2': sorted(in_second - in_first),
            }
    return outcome

def main(repo1_path, repo2_path):
    """Print, per package, the Java method names unique to each repository."""
    differences = compare_methods(
        get_all_method_names(repo1_path),
        get_all_method_names(repo2_path),
    )
    headings = (
        ('only_in_repo1', "Methods only in Repo1:"),
        ('only_in_repo2', "Methods only in Repo2:"),
    )
    for package, diffs in differences.items():
        print(f"\nComparing methods in package: {package}")
        for key, heading in headings:
            if not diffs[key]:
                continue
            print(heading)
            for method in diffs[key]:
                print(f"  - {method}")

if __name__ == "__main__":
    # Script entry point: runs only when the file is executed directly.
    # Replace these paths with the actual paths to your Java repositories
    # (both are empty placeholders as written).
    repo1_path = ""
    repo2_path = ""
    main(repo1_path, repo2_path)

Py compare script 4

Sachin Varghese — Wed, 18 Jun 2025 15:15:08 +0000

import os
import difflib


def extract_method_names(file_path):
    """Extract function and method names from a Python file.

    A line counts as a definition when, after stripping surrounding
    whitespace, it starts with ``def `` or ``async def ``.

    Args:
        file_path: Path of the Python source file to scan.

    Returns:
        list: Names in order of appearance, without surrounding whitespace.
    """
    method_names = []
    # Read as UTF-8 and ignore undecodable bytes so that one odd file does
    # not abort a scan of the whole repository.
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        for line in file:
            line = line.strip()
            if line.startswith('async def '):
                # Treat coroutines like ordinary definitions.
                line = line[len('async '):]
            if line.startswith('def '):
                # Extract method name: the text between 'def ' and '('
                method_names.append(line[4:].split('(')[0].strip())
    return method_names


def get_package_methods(repo_path):
    """Get all method names from Python files in a repository.

    Args:
        repo_path: Root directory to walk.

    Returns:
        dict: Maps a dotted package name (directory path relative to
        *repo_path*, path separators replaced by ".") to the method names
        of all ``.py`` files in that directory.
    """
    package_methods = {}
    for root, _, files in os.walk(repo_path):
        for file in files:
            if file.endswith('.py'):
                file_path = os.path.join(root, file)
                methods = extract_method_names(file_path)
                package_name = os.path.relpath(root, repo_path).replace(os.sep, '.')
                # Accumulate rather than assign: a directory usually holds
                # several modules, and assigning kept only the last one.
                package_methods.setdefault(package_name, []).extend(methods)
    return package_methods


def compare_methods(repo1_methods, repo2_methods):
    """Diff the method-name lists of packages present in both repositories.

    Returns:
        dict: Package name -> list of unified-diff lines (labelled "Repo1"
        and "Repo2"). Packages missing from *repo2_methods* are skipped; a
        package with identical lists maps to an empty list.
    """
    shared_packages = [pkg for pkg in repo1_methods if pkg in repo2_methods]
    return {
        pkg: list(
            difflib.unified_diff(
                repo1_methods[pkg],
                repo2_methods[pkg],
                fromfile='Repo1',
                tofile='Repo2',
                lineterm='',
            )
        )
        for pkg in shared_packages
    }


def main(repo1_path, repo2_path):
    """Print a unified diff of method names for each package both repos share."""
    diffs_by_package = compare_methods(
        get_package_methods(repo1_path),
        get_package_methods(repo2_path),
    )

    for package in diffs_by_package:
        print(f"Comparing methods in package: {package}")
        for diff_line in diffs_by_package[package]:
            print(diff_line)


if __name__ == "__main__":
    # Script entry point: runs only when the file is executed directly.
    # Replace these paths with the actual paths to your repositories
    repo1_path = '/path/to/repo1'
    repo2_path = '/path/to/repo2'
    main(repo1_path, repo2_path)

Py compare script 3

Sachin Varghese — Wed, 18 Jun 2025 15:14:20 +0000

import os
import difflib


def extract_method_names(file_path):
    """Extract method names from a Python file.

    Returns the names of all lines that, once stripped, start with ``def ``,
    in order of appearance.
    """
    method_names = []
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if line.startswith('def '):
                # Extract method name
                method_name = line.split('(')[0][4:]  # Get the name after 'def'
                method_names.append(method_name)
    return method_names


def get_package_methods(repo_path):
    """Get all method names from Python files in a repository.

    Returns a dict that maps the dotted relative directory path to the
    method names of every ``.py`` file in that directory.
    """
    package_methods = {}
    for root, _, files in os.walk(repo_path):
        for file in files:
            if file.endswith('.py'):
                file_path = os.path.join(root, file)
                methods = extract_method_names(file_path)
                package_name = os.path.relpath(root, repo_path).replace(os.sep, '.')
                # Accumulate so that several modules in one directory do
                # not overwrite each other.
                package_methods.setdefault(package_name, []).extend(methods)
    return package_methods


def compare_methods(repo1_methods, repo2_methods):
    """Compare method names from two repositories.

    Returns a dict of unified-diff lines for each package found in both
    inputs.
    """
    comparison_results = {}
    for package, methods in repo1_methods.items():
        if package in repo2_methods:
            repo2_method_names = repo2_methods[package]
            # Compare method names
            diff = difflib.unified_diff(
                methods,
                repo2_method_names,
                lineterm='',
                fromfile='Repo1',
                tofile='Repo2'
            )
            comparison_results[package] = list(diff)
    return comparison_results


def main(repo1_path, repo2_path):
    """Print the method-name diff of every package both repositories share."""
    repo1_methods = get_package_methods(repo1_path)
    repo2_methods = get_package_methods(repo2_path)
    comparison_results = compare_methods(repo1_methods, repo2_methods)

    for package, diffs in comparison_results.items():
        print(f"Comparing methods in package: {package}")
        for line in diffs:
            print(line)


if __name__ == "__main__":
    # Replace these paths with the actual paths to your repositories
    repo1_path = '/path/to/repo1'
    repo2_path = '/path/to/repo2'
    main(repo1_path, repo2_path)

Py compare script 2

Sachin Varghese — Wed, 18 Jun 2025 15:13:14 +0000

import os
import difflib

def extract_method_names(file_path):
    """Extract method names from a Python file.

    Returns the names of all lines that, once stripped, start with ``def ``,
    in order of appearance.
    """
    method_names = []
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if line.startswith('def '):
                # Extract method name
                method_name = line.split('(')[0][4:]  # Get the name after 'def'
                method_names.append(method_name)
    return method_names

def compare_methods(repo1_methods, repo2_methods):
    """Compare method names from two repositories.

    Returns a dict that maps each package present in both inputs to the
    unified-diff lines (labelled "Repo1" and "Repo2") of its two method
    lists.
    """
    comparison_results = {}
    for package, methods in repo1_methods.items():
        if package in repo2_methods:
            repo2_method_names = repo2_methods[package]
            # Compare method names
            diff = difflib.unified_diff(methods, repo2_method_names, lineterm='', fromfile='Repo1', tofile='Repo2')
            comparison_results[package] = list(diff)
    return comparison_results

if __name__ == "__main__":
    # NOTE(review): this listing does not define main(); running it as
    # posted would stop with a NameError on the call below.
    # Replace these paths with the actual paths to your repositories
    repo1_path = '/path/to/repo1'
    repo2_path = '/path/to/repo2'
    main(repo1_path, repo2_path)

Py script to compare repo

Sachin Varghese — Wed, 18 Jun 2025 14:41:36 +0000

import os
import difflib

def compare_methods(repo1_methods, repo2_methods):
    """Compare method names from two repositories.

    Returns a dict that maps each package present in both inputs to the
    unified-diff lines (labelled "Repo1" and "Repo2") of its two method
    lists.
    """
    comparison_results = {}
    for package, methods in repo1_methods.items():
        if package in repo2_methods:
            repo2_method_names = repo2_methods[package]
            # Compare method names
            diff = difflib.unified_diff(methods, repo2_method_names, lineterm='', fromfile='Repo1', tofile='Repo2')
            comparison_results[package] = list(diff)
    return comparison_results

def main(repo1_path, repo2_path):
    """Print the method-name diff of every package both repositories share."""
    # NOTE(review): this listing does not define get_package_methods(); it
    # has to be supplied before the script can run.
    repo1_methods = get_package_methods(repo1_path)
    repo2_methods = get_package_methods(repo2_path)
    comparison_results = compare_methods(repo1_methods, repo2_methods)
    for package, diffs in comparison_results.items():
        print(f"Comparing methods in package: {package}")
        for line in diffs:
            print(line)
if __name__ == "__main__":
    # Replace these paths with the actual paths to your repositories
    repo1_path = ''
    repo2_path = ''
    main(repo1_path, repo2_path)

Keep awake - ps script

Sachin Varghese — Sat, 14 Jun 2025 09:35:34 +0000

Thanks for the image — the issue is clear!

❌ Error Explained:

Method invocation failed because [System.Object[]] does not contain a method named 'op_Addition'

This means you're trying to add to a value that’s not a point object, probably because .Position is returning an array or something not cast correctly.

✅ Fix — Cast the mouse position correctly before doing math

Update your script like this:

# Load the .NET assemblies that provide the Cursor and Point types used below.
Add-Type -AssemblyName System.Windows.Forms
Add-Type -AssemblyName System.Drawing

Clear-Host
Write-Host "...Lock screen avoider with mouse movement..."

$sleep = 30 # seconds to wait between nudges

while ($true) {
    # Read the current cursor location and keep its coordinates as numbers
    $cursor = [System.Windows.Forms.Cursor]::Position
    $originX = $cursor.X
    $originY = $cursor.Y

    # Nudge the pointer one pixel to the right, pause briefly, then put it back
    [System.Windows.Forms.Cursor]::Position = New-Object System.Drawing.Point($originX + 1, $originY)
    Start-Sleep -Milliseconds 100
    [System.Windows.Forms.Cursor]::Position = New-Object System.Drawing.Point($originX, $originY)

    Write-Host "Mouse moved to prevent lock. Waiting $sleep seconds..."
    Start-Sleep -Seconds $sleep
}

✅ Why this works:

$pos.X and $pos.Y safely extract coordinates as numbers.

New-Object System.Drawing.Point(...) constructs a valid point object for cursor movement.

Avoids the invalid attempt to do math on an object array.

Let me know if you'd like:

To run it in the background silently

A scheduled task version

A .ps1 file ready to download and use

Letter01

Sachin Varghese — Wed, 23 Apr 2025 14:11:02 +0000

Here's a professional and concise email you can send to your Business Analyst to be forwarded to the client:

Subject: Clarification Required Regarding RAALL002 Script and OB Episode Error

Hi [Business Analyst's Name],

Could you please check with the client on the following points regarding the RAALL002 script and the OB Episode record error?

We are unable to replicate the issue in our environment (JEDI 2.0); it is working as expected.

RAALL002 is a pre-save rule, so it will be triggered first during the save and close of the OB Episode record. Please confirm if there are any other pre-save rules defined that may be executing before this one. (Specifically, any custom execution sequence implemented for OB Episode save/close in CareManage.)

Typically, this error occurs when attempting to insert/update the OB Episode record in the database. However, the RAALL002 script does not involve any direct database interaction, which could otherwise cause such an error.

Kindly let us know once the client confirms these details so we can proceed accordingly.

Best regards,
[Your Full Name]
[Your Job Title]
[Your Company Name]

Let me know if you'd like to add anything specific like a ticket number or reference screenshots/logs.

JSON to CSV - 01

Sachin Varghese — Wed, 23 Apr 2025 07:01:41 +0000

I'll write a Python script to convert your large JSON file to CSV format. Based on the snippet you've shared, I can see it's a complex JSON with nested structures that needs careful handling for efficient processing.

Here's a Python script optimized for handling large JSON files:

import json
import csv
import os
import gc
from itertools import islice
import ijson  # You'll need to install this: pip install ijson

def flatten_json(y):
    """Flatten a nested JSON structure into a single-level dictionary.

    Nested keys are joined with "_" and list positions become their index,
    so ``{"a": {"b": [1, 2]}}`` yields ``{"a_b_0": 1, "a_b_1": 2}``.
    Empty dicts and empty lists contribute no keys. A scalar passed at the
    top level is stored under the empty-string key.

    Args:
        y: Parsed JSON value (dict, list, or scalar).

    Returns:
        dict: Flat mapping of joined key path to leaf value.
    """
    out = {}

    def flatten(x, name=''):
        # isinstance (rather than an exact type check) lets dict and list
        # subclasses such as OrderedDict be descended into as well.
        if isinstance(x, dict):
            for key, value in x.items():
                flatten(value, name + key + '_')
        elif isinstance(x, list):
            for index, value in enumerate(x):
                flatten(value, name + str(index) + '_')
        else:
            # Drop the trailing "_" that the recursion appended.
            out[name[:-1]] = x

    flatten(y)
    return out

def process_json_to_csv(json_file, csv_file, batch_size=1000):
    """Process a large JSON file to CSV with batching for memory efficiency.

    The input is expected to be a JSON array at the root; its elements are
    streamed with ijson, flattened with flatten_json and written in batches.

    The CSV columns are the sorted keys seen in the first *batch_size*
    records. Keys that first appear in later records are left out of the
    CSV and reported in a warning once the file is written.

    Args:
        json_file: Path of the input JSON file.
        csv_file: Path of the CSV file to create (overwritten).
        batch_size: Number of records buffered before each write.
    """
    # Use ijson to parse the JSON file as a stream
    with open(json_file, 'rb') as f:
        # For an array of objects
        objects = ijson.items(f, 'item')

        # The first batch determines the header row
        first_batch = [flatten_json(obj) for obj in islice(objects, batch_size)]
        if not first_batch:
            print("No data found in JSON file")
            return

        headers = sorted({key for row in first_batch for key in row})
        known_headers = set(headers)
        dropped_keys = set()

        with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
            # extrasaction='ignore': without it, a later record carrying a
            # key outside `headers` makes DictWriter raise ValueError and
            # aborts the conversion midway.
            writer = csv.DictWriter(csvfile, fieldnames=headers, extrasaction='ignore')
            writer.writeheader()
            writer.writerows(first_batch)

            # Process the rest of the file in batches
            batch = []
            for obj in objects:
                row = flatten_json(obj)
                dropped_keys.update(row.keys() - known_headers)
                batch.append(row)

                if len(batch) >= batch_size:
                    writer.writerows(batch)
                    batch = []
                    gc.collect()  # Force garbage collection

            # Write any remaining items
            if batch:
                writer.writerows(batch)

        if dropped_keys:
            print(
                f"Warning: {len(dropped_keys)} column(s) not present in the first "
                f"{batch_size} records were omitted: {', '.join(sorted(dropped_keys))}"
            )

def main():
    """Convert the configured JSON file to CSV and report the output size."""
    source_path = 'your_large_file.json'  # Replace with your JSON file path
    target_path = 'output.csv'

    print(f"Converting {source_path} to {target_path}...")
    process_json_to_csv(source_path, target_path)
    print(f"Conversion complete! Output saved to {target_path}")

    # Size of the written file, converted from bytes to mebibytes.
    size_mb = os.path.getsize(target_path) / (1024 * 1024)
    print(f"CSV file size: {size_mb:.2f} MB")

if __name__ == "__main__":
    # Script entry point: run the conversion only when executed directly.
    main()

Key optimization techniques used in this script:

Stream parsing with ijson: Instead of loading the entire JSON into memory, ijson parses the file as a stream, which is crucial for large files.
Batch processing: Processing the data in batches (1000 records at a time) to control memory usage.
Flattening nested structures: The flatten_json function converts nested JSON objects into a flat structure suitable for CSV.
Garbage collection: Explicitly calling gc.collect() after processing each batch to free up memory.
Optimized CSV writing: Using DictWriter for efficient CSV creation.

You may need to adjust the script based on the exact structure of your JSON. If your JSON is not an array at the root level but has a different structure, you'll need to modify the ijson path in ijson.items(f, 'item') to match your structure.

Also, if your JSON has deeply nested structures, the flattened column names could get very long. You might want to customize the flatten_json function to handle the naming scheme differently if needed.

Would you like me to explain any specific part of this script in more detail?

Py txt to CSV - 02

Sachin Varghese — Tue, 22 Apr 2025 17:03:07 +0000

The provided code is a well-structured script for converting a large text file containing concatenated JSON objects into a CSV file using Python, pandas, and a streaming approach to handle memory efficiently. Below is a detailed review of the code, including its strengths, potential issues, and suggestions for improvement.

Strengths

Memory Efficiency with Streaming:
- The stream_json_objects function reads the file in chunks (64KB) and processes JSON objects incrementally, avoiding loading the entire file into memory. This is critical for handling large files.
- The use of a buffer and regex (separator_re) to split concatenated JSON objects is robust for handling objects split across chunk boundaries.
Batch Processing:
- The json_to_csv_optimized function processes JSON objects in batches (batch_size=10000), normalizing them into a pandas DataFrame and writing to CSV incrementally. This balances memory usage and performance.
Error Handling:
- The code includes comprehensive error handling for:
  - File not found (FileNotFoundError).
  - JSON parsing errors (json.JSONDecodeError).
  - General exceptions during file processing or normalization.
- Warnings are printed with useful context (e.g., buffer snippets) to aid debugging.
Column Consistency:
- The script ensures consistent column headers across batches by determining columns from the first valid batch and reindexing subsequent batches to match. This prevents misaligned CSV output.
Clean File Management:
- Removes the default Excel file (result.xlsx) if it exists, avoiding confusion from previous runs.
- Uses 'a' (append) mode for CSV writing after the header is written, ensuring efficient file operations.
Modularity:
- The code is split into two clear functions: stream_json_objects for parsing and json_to_csv_optimized for conversion, making it reusable and maintainable.

Potential Issues and Suggestions

While the code is robust, there are a few areas where it could be improved or where edge cases might cause issues.

1. Edge Case: Malformed JSON Objects

Issue: If the input file contains malformed JSON objects or unexpected separators (e.g., }{ inside a string value), the stream_json_objects function skips the invalid segment but might lose data. The warning message helps, but it doesn’t allow for recovery of partial valid objects.

Suggestion: Add an option to log skipped segments to a separate file for post-processing or manual inspection. For example:

 def stream_json_objects(filepath, log_file=None):
     if log_file:
         log = open(log_file, 'w', encoding='utf-8')
     try:
         # ... existing code ...
         except json.JSONDecodeError as e:
             print(f"Warning: JSONDecodeError at position {match.start()+1}: {e}")
             if log_file:
                 log.write(f"Error at position {match.start()+1}: {potential_obj_str}\n")
             last_processed_index = match.end()
     finally:
         if log_file:
             log.close()

Usage: stream_json_objects(json_file_path, log_file='skipped_segments.txt').

2. Performance with Large JSON Objects

Issue: If individual JSON objects are very large (e.g., megabytes each), the 64KB chunk size may result in frequent buffer resizing and incomplete object parsing, slowing down processing.
Suggestion: Make the chunk size configurable to allow tuning based on the expected JSON object size:
```
 def stream_json_objects(filepath, chunk_size=65536):
     # ... use chunk_size in f.read(chunk_size) ...
```
Then call: stream_json_objects(json_file_path, chunk_size=1048576) for larger objects (e.g., 1MB).

3. Separator Regex Limitations

Issue: The regex }[ \t\r\n]*{ assumes JSON objects are separated by optional whitespace. If the file uses a different separator (e.g., commas, newlines only, or no separator), parsing will fail or produce incorrect splits.
Suggestion: Add flexibility to handle different separator patterns or detect them dynamically. For example:
```
 def stream_json_objects(filepath, separator_pattern=r"}[ \t\r\n]*{"):
     separator_re = re.compile(separator_pattern)
     # ... rest of the function ...
```
Alternatively, add a preprocessing step to detect the separator by scanning the first few KB of the file.

4. Empty or Invalid File Handling

Issue: If the input file is empty or contains no valid JSON objects, the script correctly reports “No valid JSON objects were parsed” but still creates an empty result.csv if any batch was attempted. This could be confusing.

Suggestion: Check if any valid objects were processed before creating the CSV file, or clean up the empty CSV file:

 if not header_written and all_columns is None:
     print("No valid JSON objects were parsed. No output CSV was created.")
     if os.path.exists(csv_file):
         os.remove(csv_file)
         print(f"Removed empty CSV file: {csv_file}")

5. CSV Encoding and Delimiter Flexibility

Issue: The CSV output is hardcoded to use UTF-8 encoding and the default pandas delimiter (,). Some systems or datasets may require different encodings (e.g., UTF-16) or delimiters (e.g., ;, \t).

Suggestion: Add parameters for encoding and delimiter:

 def json_to_csv_optimized(json_file, csv_file='result.csv', batch_size=5000, encoding='utf-8', delimiter=','):
     # ... in df.to_csv calls ...
     df.to_csv(csv_file, index=False, mode='w', header=True, encoding=encoding, sep=delimiter)
     # ... and in append mode ...
     df.to_csv(csv_file, index=False, mode='a', header=False, encoding=encoding, sep=delimiter)

Usage: json_to_csv_optimized(json_file_path, csv_output_path, encoding='utf-16', delimiter=';').

6. Progress Feedback

Issue: For very large files, the user only gets feedback per batch (e.g., every 10,000 objects). Long-running processes could benefit from more granular progress updates.

Suggestion: Add a counter for total objects processed and print progress every N objects:

 total_objects = 0
 for i, obj in enumerate(stream_json_objects(json_file)):
     total_objects += 1
     batch_data.append(obj)
     if total_objects % 1000 == 0:
         print(f"Processed {total_objects} objects...")
 # ... rest of the function ...
 print(f"Total objects processed: {total_objects}")

7. Commented-Out Excel Code

Issue: The commented-out json_to_excel function and openpyxl import are remnants of the original script. While harmless, they add clutter and could confuse maintainers.
Suggestion: Remove the commented-out code and the openpyxl import unless there’s a plan to reintroduce Excel support. If Excel output is needed, consider implementing it as an optional output format in json_to_csv_optimized with a parameter (e.g., output_format='csv' or 'excel').

8. Batch Size Tuning

Issue: The default batch_size=10000 may be too large for systems with limited memory or too small for very simple JSON objects, affecting performance.
Suggestion: Provide guidance in the docstring or add a dynamic batch size adjustment based on memory usage or object complexity. Alternatively, make it easier to tune via a command-line argument or config:
```
 json_to_csv_optimized(json_file_path, csv_output_path, batch_size=5000)  # Smaller batch for low-memory systems
```

9. Unused `io` Import

Issue: The io module is imported but not used in the provided code. This is minor but could indicate an oversight or leftover from earlier versions.
Suggestion: Remove the import io line unless there’s a specific plan to use it (e.g., for in-memory buffering).

10. Documentation and Type Hints

Issue: The docstrings are clear, but they could be enhanced with return types and parameter types for better IDE support and maintainability.

Suggestion: Add type hints and improve docstrings:

 from typing import Generator, Dict, List, Union
 import pandas as pd

 def stream_json_objects(filepath: str, chunk_size: int = 65536) -> Generator[Union[Dict, List], None, None]:
     """
     Streams JSON objects from a text file containing concatenated objects.
     Args:
         filepath: Path to the input text file.
         chunk_size: Size of chunks to read from file (in bytes).
     Yields:
         Parsed Python dictionary or list per JSON object.
     """
     # ... function body ...

 def json_to_csv_optimized(json_file: str, csv_file: str = 'result.csv', batch_size: int = 5000, encoding: str = 'utf-8') -> None:
     """
     Converts a text file with concatenated JSON objects to a CSV file.
     Args:
         json_file: Path to the input text file.
         csv_file: Path for the output CSV file.
         batch_size: Number of JSON objects to process per batch.
         encoding: Encoding for the output CSV file.
     """
     # ... function body ...

Specific Code Fixes

Here’s a consolidated version of the suggested changes applied to the code, keeping it concise:

import pandas as pd
import json
import re
import os
from typing import Generator, Dict, List, Union

def stream_json_objects(filepath: str, chunk_size: int = 65536) -> Generator[Union[Dict, List], None, None]:
    """
    Streams JSON objects from a text file containing concatenated objects.

    Objects are expected back to back, optionally separated by whitespace
    (a closing brace, optional spaces/tabs/newlines, then an opening brace).
    Segments that fail to parse are reported on stdout and skipped.

    Limitation: the separator is found with a regular expression, so a
    string value that itself contains a closing brace followed by an opening
    brace is split in the wrong place and reported as a parse failure.

    Args:
        filepath: Path to the input text file (read as UTF-8).
        chunk_size: Number of characters to read per chunk (the file is
            opened in text mode, so this is characters, not bytes).
    Yields:
        The parsed value of each segment (normally a dictionary).
    """
    buffer = ""
    separator_re = re.compile(r"}[ \t\r\n]*{")

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            while True:
                chunk = f.read(chunk_size)
                if not chunk:
                    break
                buffer += chunk

                segment_start = 0
                for match in separator_re.finditer(buffer):
                    # The segment runs up to and including the closing brace.
                    potential_obj_str = buffer[segment_start:match.start() + 1]
                    # The match ends just past the opening brace of the next
                    # object; step back one character so that brace is kept.
                    segment_start = match.end() - 1
                    try:
                        obj = json.loads(potential_obj_str)
                    except json.JSONDecodeError as e:
                        print(f"Warning: JSONDecodeError at position {match.start()+1}: {e}")
                        print(f"Segment snippet (first 200 chars): {potential_obj_str[:200]}...")
                        continue
                    yield obj

                # Keep only the unfinished tail; it is completed by later chunks.
                buffer = buffer[segment_start:]

            # Whatever is left after the last separator is the final object.
            buffer = buffer.strip()
            if buffer:
                try:
                    obj = json.loads(buffer)
                    yield obj
                except json.JSONDecodeError as e:
                    print(f"Warning: Could not parse remaining buffer: {e}")
                    print(f"Remaining buffer snippet (first 200 chars): {buffer[:200]}...")
    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
    except Exception as e:
        # Best-effort streaming: report the problem and end the stream.
        print(f"An error occurred: {e}")

def _write_batch(batch_data, csv_file, all_columns, encoding):
    """Normalize one batch, write it to ``csv_file`` and return the header columns.

    When ``all_columns`` is None the batch defines the header and the file is
    (re)created; otherwise the batch is aligned to ``all_columns`` and appended.
    """
    df = pd.json_normalize(batch_data)
    if all_columns is None:
        df.to_csv(csv_file, index=False, mode='w', header=True, encoding=encoding)
        return list(df.columns)

    # The header is already on disk, so columns that first appear in a later
    # batch cannot be added; report them instead of dropping them silently.
    known = set(all_columns)
    extra = [col for col in df.columns if col not in known]
    if extra:
        print(f"Warning: dropping {len(extra)} column(s) not present in the CSV header: {extra}")
    df = df.reindex(columns=all_columns, fill_value=None)
    df.to_csv(csv_file, index=False, mode='a', header=False, encoding=encoding)
    return all_columns


def json_to_csv_optimized(json_file: str, csv_file: str = 'result.csv', batch_size: int = 5000, encoding: str = 'utf-8', chunk_size: int = 65536) -> None:
    """
    Converts a text file with concatenated JSON objects to a CSV file.

    Objects are streamed with ``stream_json_objects`` and flattened with
    ``pandas.json_normalize`` one batch at a time. The columns of the first
    successfully written batch become the CSV header; later batches are
    aligned to that header (missing columns are left empty, extra columns
    are reported and dropped). Errors are printed rather than raised.

    Args:
        json_file: Path to the input text file.
        csv_file: Path for the output CSV file. An existing file is removed first.
        batch_size: Number of JSON objects to process per batch.
        encoding: Encoding for the output CSV file.
        chunk_size: Read size forwarded to ``stream_json_objects``.
    """
    if os.path.exists(csv_file):
        try:
            os.remove(csv_file)
            print(f"Removed existing CSV file: {csv_file}")
        except OSError as e:
            print(f"Error removing {csv_file}: {e}")

    all_columns = None
    header_written = False
    batch_data = []
    total_objects = 0

    print(f"Starting conversion of '{json_file}' to '{csv_file}'...")
    try:
        for i, obj in enumerate(stream_json_objects(json_file, chunk_size=chunk_size)):
            total_objects += 1
            batch_data.append(obj)
            if total_objects % 1000 == 0:
                print(f"Processed {total_objects} objects...")

            if len(batch_data) >= batch_size:
                try:
                    is_first = all_columns is None
                    all_columns = _write_batch(batch_data, csv_file, all_columns, encoding)
                    if is_first:
                        header_written = True
                        print(f"Processed batch {i // batch_size + 1}. Header written.")
                    else:
                        print(f"Processed batch {i // batch_size + 1}.")
                except Exception as e:
                    # A bad batch is skipped so the rest of the file still converts.
                    print(f"Warning: Error normalizing batch starting at object {i - len(batch_data) + 1}: {e}")
                batch_data = []

        if batch_data:
            try:
                is_first = all_columns is None
                all_columns = _write_batch(batch_data, csv_file, all_columns, encoding)
                if is_first:
                    header_written = True
                    print(f"Processed remaining {len(batch_data)} objects. Header written.")
                else:
                    print(f"Processed remaining {len(batch_data)} objects.")
            except Exception as e:
                print(f"Warning: Error normalizing final batch: {e}")

        if not header_written:
            print("No valid JSON objects were parsed. No output CSV created.")
            if os.path.exists(csv_file):
                os.remove(csv_file)
                print(f"Removed empty CSV file: {csv_file}")
        else:
            print(f"Data written successfully to '{csv_file}'")
        print(f"Total objects processed: {total_objects}")

    except Exception as e:
        print(f"An unexpected error occurred: {e}")

# Example usage: convert "sample2.txt" to "result.csv", 10,000 objects per batch.
# These statements sit at module level, so they run as soon as the script is
# executed.
json_file_path = "sample2.txt"
csv_output_path = "result.csv"
json_to_csv_optimized(json_file_path, csv_output_path, batch_size=10000, encoding='utf-8')

Key Changes in the Fixed Version

Removed unused io and commented-out openpyxl code.
Added type hints for better code clarity.
Made chunk size configurable in stream_json_objects.
Added progress feedback every 1000 objects.
Ensured empty CSV files are removed if no valid objects are parsed.
Added encoding parameter to json_to_csv_optimized.
Improved docstrings with type information.
Removed the Excel file cleanup logic since it’s no longer relevant (replaced with CSV cleanup).

Testing Recommendations

To ensure the code works as expected, test it with the following scenarios:

Valid Input: A large file with correctly formatted JSON objects separated by }{ or whitespace.
Malformed JSON: A file with some invalid JSON objects to verify error handling and skipping.
Empty File: An empty file to check that no CSV is created.
Single Large Object: A file with one very large JSON object to test chunk size handling.
Different Separators: A file with JSON objects separated by commas to test regex robustness (this requires modifying the separator pattern; newlines and other whitespace between objects are already handled by the current pattern).
Low Memory: Run on a system with limited memory to tune batch_size and chunk_size.

Conclusion

The code is well-designed for its purpose, with strong memory efficiency and error handling. The suggested improvements enhance its flexibility, robustness, and usability without significantly altering its core functionality. The fixed version incorporates these changes and is ready for use with large JSON files. If you have specific requirements (e.g., support for Excel output, different separators), let me know, and I can tailor the code further!

py txt to CSV - 01

Sachin Varghese — Tue, 22 Apr 2025 16:28:54 +0000

For handling large text files (~25 MB or 500,000 lines) containing JSON data, we need to optimize the script to process the file efficiently, minimize memory usage, and improve performance. Using a CSV file instead of an Excel file is a good choice for large datasets because CSV files are simpler, faster to write, and consume less memory. Below is a modified version of your script that incorporates optimizations for large files, uses streaming for JSON parsing, and outputs to a CSV file.

Key Optimizations

Streaming JSON Parsing: Use ijson to parse JSON objects iteratively, avoiding loading the entire file into memory.
Chunked Processing: Process the data in chunks to manage memory usage when converting to a DataFrame and writing to CSV.
CSV Output: Write to a CSV file instead of Excel for faster performance and lower memory usage.
Generator-Based Processing: Use generators to yield JSON objects one at a time, reducing memory overhead.
Efficient File Reading: Read the file line by line or in chunks to handle large files efficiently.
Avoid Regex for Splitting: Instead of using re.sub to split JSON objects, rely on streaming parsing to handle object boundaries.

Modified Script

import pandas as pd
import ijson
import csv
import os
from itertools import islice

def json_to_csv(json_file, csv_file='result.csv', chunk_size=10000):
    """
    Convert a large JSON file to CSV efficiently using streaming and chunked processing.

    The file is first streamed with ijson as a top-level JSON array. If ijson
    reports a parsing error, the file is re-read from the start line by line,
    accumulating lines until they parse as a JSON value.

    Args:
        json_file (str): Path to the input JSON file.
        csv_file (str): Path to the output CSV file.
        chunk_size (int): Number of records to process in each chunk.
    """
    # Function-scope import: only the fallback parser needs the json module.
    import json

    # Check if file exists
    if not os.path.exists(json_file):
        print(f"Error: File {json_file} does not exist.")
        return

    first_record = True   # True until the first chunk (with headers) is written
    chunks_written = 0
    records = []

    # Use ijson to stream JSON objects
    with open(json_file, 'rb') as file:  # Binary mode for ijson
        try:
            # Parse JSON objects iteratively; assumes a JSON array of objects
            for obj in ijson.items(file, 'item'):
                records.append(obj)

                # Process records in chunks
                if len(records) >= chunk_size:
                    write_chunk(records, csv_file, first_record)
                    chunks_written += 1
                    first_record = False
                    records = []  # Clear records after writing

        except ijson.JSONError as e:
            print(f"JSON parsing error: {e}. Attempting line-by-line parsing...")
            # Fallback: parse line by line, starting over from the top.
            file.seek(0)
            # The fallback re-reads every record, so the next chunk must
            # overwrite (not append to) anything the ijson pass wrote;
            # otherwise those rows would appear twice.
            first_record = True
            records = []
            current_obj = ""
            for line in file:
                line = line.decode('utf-8').strip()  # Decode from bytes to string
                if not line:
                    continue
                current_obj += line
                try:
                    parsed = json.loads(current_obj)
                except json.JSONDecodeError:
                    # Not a complete value yet; keep accumulating lines.
                    continue
                records.append(parsed)
                current_obj = ""

                # Process records in chunks
                if len(records) >= chunk_size:
                    write_chunk(records, csv_file, first_record)
                    chunks_written += 1
                    first_record = False
                    records = []

            if current_obj:
                print(f"Warning: {len(current_obj)} trailing characters could not be parsed as JSON and were skipped.")

    # Write any remaining records
    if records:
        write_chunk(records, csv_file, first_record)
        chunks_written += 1

    if chunks_written:
        print(f"Data written successfully to {csv_file}")
    else:
        print(f"No records were parsed from {json_file}; nothing was written.")

def write_chunk(records, csv_file, first_record):
    """
    Write a chunk of records to the CSV file.

    The first chunk creates the file and writes the header. Later chunks are
    appended without a header, so they are first aligned to the header that is
    already in the file: columns are put in header order, columns missing from
    the chunk are left empty, and columns absent from the header are reported
    and dropped.

    Args:
        records (list): List of JSON objects to write.
        csv_file (str): Path to the output CSV file.
        first_record (bool): Whether this is the first chunk (to write headers).
    """
    # Convert records to DataFrame
    df = pd.json_normalize(records)

    if first_record:
        df.to_csv(csv_file, mode='w', header=True, index=False, encoding='utf-8')
        return

    # Read the header row back from the file so appended rows line up with it.
    try:
        with open(csv_file, newline='', encoding='utf-8') as existing:
            header_columns = next(csv.reader(existing), None)
    except FileNotFoundError:
        header_columns = None

    if header_columns:
        known = set(header_columns)
        extra = [col for col in df.columns if col not in known]
        if extra:
            print(f"Warning: dropping {len(extra)} column(s) not present in the CSV header: {extra}")
        df = df.reindex(columns=header_columns)

    df.to_csv(csv_file, mode='a', header=False, index=False, encoding='utf-8')

# Example usage: convert "sample2.txt" to "result.csv", 10,000 records per chunk.
# These statements sit at module level, so they run as soon as the script is
# executed.
json_file_path = "sample2.txt"
json_to_csv(json_file_path, csv_file='result.csv', chunk_size=10000)

Explanation of Changes and Optimizations

Use of ijson:
- The ijson library is used to parse JSON objects iteratively, which is critical for large files. It avoids loading the entire file into memory.
- The items method with 'item' assumes the JSON file is an array of objects (e.g., [{}, {}, ...]). If your JSON file has a different structure (e.g., newline-delimited JSON), adjust the parsing logic accordingly.
Chunked Processing:
- Records are collected in chunks of chunk_size (default: 10,000) before being converted to a DataFrame and written to CSV. This balances memory usage and performance.
- The write_chunk function handles writing each chunk to the CSV file, ensuring headers are written only for the first chunk.
CSV Output:
- CSV is used instead of Excel because it’s faster and more memory-efficient for large datasets. Libraries like pandas write to CSV more efficiently than to Excel formats like XLSX.
- The to_csv method is used with mode='a' (append) for subsequent chunks to avoid overwriting the file.
Fallback for Malformed JSON:
- If ijson fails (e.g., due to malformed JSON), the script falls back to line-by-line parsing, similar to your original script, but optimized for memory by processing chunks.
Binary File Reading:
- The file is opened in binary mode ('rb') for ijson, which is more efficient for large files. For the fallback method, lines are decoded to strings as needed.
Error Handling:
- Checks if the input file exists.
- Handles JSON parsing errors gracefully with a fallback mechanism.
Configurable Chunk Size:
- The chunk_size parameter allows you to tune the number of records processed at a time based on your system’s memory capacity. A larger chunk_size reduces I/O overhead but increases memory usage.

Prerequisites

Install the required libraries:

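pip install pandas ijson

(openpyxl is not used by the script above; install it only if you adapt write_chunk for the optional Excel output described in the Notes.)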

Notes

JSON Structure: The script assumes the JSON file is either an array of objects or newline-delimited JSON (one object per line). If your JSON has a different structure, you may need to adjust the ijson.items call or the fallback parsing logic.
Performance Tuning: Adjust chunk_size based on your system’s memory. For a system with 8 GB RAM, a chunk_size of 10,000–50,000 is typically reasonable.
Output Size: CSV is uncompressed plain text, so the output can be larger than an equivalent XLSX file (which is zip-compressed); ensure you have enough disk space for the output (~25 MB input could produce a larger CSV depending on the data).
Excel Alternative: If you still need Excel output, you can modify the write_chunk function to use df.to_excel with engine='openpyxl' and append mode (using ExcelWriter), but this will be slower and more memory-intensive.

Example Input and Output

Input (sample2.txt):

[{"id": 1, "name": "Alice", "age": 30}, {"id": 2, "name": "Bob", "age": 25}]

or newline-delimited:

{"id": 1, "name": "Alice", "age": 30}
{"id": 2, "name": "Bob", "age": 25}

Output (result.csv):

id,name,age
1,Alice,30
2,Bob,25

This script should handle large files efficiently while maintaining robustness and flexibility. Let me know if you need further tweaks or have a specific JSON structure to accommodate!