AWS SQS patterns for reliable background job processing
Architecture
Producer → SQS Queue (visibility_timeout >= Lambda timeout)
↓ after maxReceiveCount = 3
Dead Letter Queue → CloudWatch alarm on messages > 0
Terraform setup
# Dead-letter queue: messages that exceed maxReceiveCount on the worker
# queue are redriven here for inspection / manual replay.
resource "aws_sqs_queue" "dlq" {
  name = "${var.service}-dlq"

  # 14 days — the SQS maximum — so failed messages survive long enough
  # to be investigated and redriven.
  message_retention_seconds = 1209600
}
# Main worker queue; redrives to the DLQ after 3 failed receives.
resource "aws_sqs_queue" "worker" {
  name = "${var.service}-queue"

  # Must be >= the Lambda timeout (AWS recommends 6x the function timeout)
  # so an in-flight message does not become visible again and get
  # processed twice while the first invocation is still running.
  visibility_timeout_seconds = 300

  redrive_policy = jsonencode({
    deadLetterTargetArn = aws_sqs_queue.dlq.arn
    maxReceiveCount     = 3
  })
}
# Alert as soon as ANY message lands in the DLQ (threshold 0, one period).
resource "aws_cloudwatch_metric_alarm" "dlq" {
  alarm_name          = "${var.service}-dlq-not-empty"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = 1
  metric_name         = "ApproximateNumberOfMessagesVisible"
  namespace           = "AWS/SQS"
  period              = 300
  statistic           = "Maximum"
  threshold           = 0
  alarm_actions       = [var.alert_sns_arn]

  # SQS stops emitting metrics for queues that stay inactive; without this
  # an empty, idle DLQ flips the alarm to INSUFFICIENT_DATA instead of OK.
  treat_missing_data = "notBreaching"

  dimensions = { QueueName = aws_sqs_queue.dlq.name }
}
Lambda with partial batch failure
def handler(event: dict, context) -> dict:
    """Process an SQS batch, reporting per-message failures.

    Returns the partial-batch-failure payload that SQS expects when the
    event source mapping sets function_response_types =
    ["ReportBatchItemFailures"]: only the messageIds listed in
    'batchItemFailures' are retried (and eventually dead-lettered);
    every other message in the batch is deleted from the queue.
    """
    # Local imports keep this snippet self-contained.
    import json
    import logging

    log = logging.getLogger(__name__)
    failures: list[dict] = []
    for record in event['Records']:
        try:
            process_message(json.loads(record['body']))
        except Exception:
            # Don't swallow the error silently: log it with traceback,
            # then report the message as failed so SQS retries it.
            log.exception("failed to process message %s", record['messageId'])
            failures.append({'itemIdentifier': record['messageId']})
    return {'batchItemFailures': failures}
# Connect the worker queue to the Lambda function.
resource "aws_lambda_event_source_mapping" "worker" {
event_source_arn = aws_sqs_queue.worker.arn
function_name = aws_lambda_function.worker.arn
batch_size = 10
# Enable partial batch responses: only the messageIds the handler returns
# in batchItemFailures are retried, instead of redelivering the whole batch.
function_response_types = ["ReportBatchItemFailures"]
# Cap how many concurrent Lambda invocations this queue can drive.
scaling_config { maximum_concurrency = 10 }
}
The most common mistake
visibility_timeout MUST be at least the Lambda timeout — AWS recommends setting it to six times the function timeout to leave headroom for batching and retries
If Lambda runs 60s and visibility_timeout = 30s:
Message becomes visible again while STILL being processed
Another consumer picks it up → processed twice
Step2Dev includes SQS templates for async processing projects.
Top comments (0)