# release-scripts/aws-s3-migration/aws-s3-migrate-missing-documents.py

import json
import os
import sys

import boto3
import mysql.connector
from botocore.exceptions import ClientError

import utils
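# NOTE: utils is a local helper module (not shown here). Its interface, as
# inferred from the call sites below, is assumed to be:
#   utils.generate_new_s3_key(old_key, prefix) -> new key string, or a falsy
#       value when no key can be derived
#   utils.move_s3_object(client, src_bucket, src_key, dst_bucket, dst_key) -> bool
#   utils.update_document_key(mysql_config, doc_id, new_key) -> bool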

# --- Configuration Loading ---
CONFIG_FILE = 'config.json'


def load_config():
    """Loads configuration from config.json."""
    if not os.path.exists(CONFIG_FILE):
        print(f"Error: Configuration file '{CONFIG_FILE}' not found.")
        sys.exit(1)
    try:
        with open(CONFIG_FILE, 'r') as f:
            return json.load(f)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON in config file: {e}")
        sys.exit(1)
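
# Expected config.json shape, reconstructed from the keys this script reads.
# The values below are illustrative placeholders, not real settings:
#
# {
#     "mysql": {
#         "host": "localhost",
#         "user": "migration_user",
#         "password": "...",
#         "database": "documents",
#         "table": "Document"
#     },
#     "aws": {
#         "source_bucket": "old-bucket",
#         "destination_bucket": "new-bucket",
#         "destination_key_prefix": "documents/",
#         "aws_access_key_id": "...",
#         "aws_secret_access_key": "...",
#         "aws_region": "us-east-1"
#     }
# }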

def fetch_documents(db_config_full):
    """
    Connects to MySQL and fetches document rows as (ID, S3Key) tuples for the
    hardcoded migration window.
    NOTE: db_config_full includes the 'table' key, which must be filtered out
    before the rest is passed to mysql.connector.connect().
    """
    conn = None
    cursor = None
    # Separate connection parameters from the table name
    table_name = db_config_full['table']
    connection_config = {k: v for k, v in db_config_full.items() if k != 'table'}
    try:
        # Connect using only valid connection parameters
        conn = mysql.connector.connect(**connection_config)
        cursor = conn.cursor()
        print(f"Fetching document IDs and S3Keys from table: {table_name}...")
        # Select ID and S3Key for documents uploaded in the migration window
        query = f"SELECT ID, S3Key FROM {table_name} where UploadedAt between '2025-11-01' and '2025-11-26' order by UploadedAt desc"
        print(query)
        cursor.execute(query)
        # Fetch results as a list of (ID, S3Key) tuples
        documents = cursor.fetchall()
        print(f"Found {len(documents)} documents to process.")
        return documents
    except mysql.connector.Error as err:
        print(f"MySQL Error: {err}")
        return []
    finally:
        if cursor:
            cursor.close()
        if conn and conn.is_connected():
            conn.close()

def check_s3_key_exists(s3_client, bucket_name, key):
    """Returns True if the key exists in the bucket, False if it does not."""
    try:
        # head_object retrieves only metadata; it's cheaper than listing objects.
        s3_client.head_object(Bucket=bucket_name, Key=key)
        return True
    except ClientError as e:
        # A 404 (Not Found) means the key does not exist.
        error_code = e.response['Error']['Code']
        if error_code == '404':
            return False
        # Other errors (e.g. 403 permission denied, missing bucket) are re-raised.
        print(f"Error checking key '{key}' in bucket '{bucket_name}': {e}")
        raise
    except Exception as e:
        # Any other unexpected error is logged and re-raised.
        print(f"An unexpected error occurred: {e}")
        raise

def migrate_object(s3_client, aws_config, old_s3_key, doc_id):
    """Moves a single S3 object and updates its database record.

    Returns the new S3 key on full success, otherwise None.
    """
    # 1. Generate new key, preserving path structure
    key_prefix = aws_config['destination_key_prefix']
    new_s3_key = utils.generate_new_s3_key(old_s3_key, key_prefix)
    if not new_s3_key:
        print(f" -> Skipping ID {doc_id}: Could not generate new key from old key.")
        return None
    print(f" -> Calculated New Key: {new_s3_key}")
    # 2. Move S3 object (Copy + Delete)
    move_successful = utils.move_s3_object(
        s3_client,
        aws_config['source_bucket'],
        old_s3_key,
        aws_config['destination_bucket'],
        new_s3_key
    )
    if move_successful:
        # 3. Update database
        config = load_config()
        db_update_successful = utils.update_document_key(
            config['mysql'],
            doc_id,
            new_s3_key
        )
        if db_update_successful:
            return new_s3_key
        # If the DB update fails, the S3 object is already MOVED. Log a critical error.
        print(f"CRITICAL: DB update failed for ID {doc_id}. Object is MOVED to "
              f"{aws_config['destination_bucket']}/{new_s3_key}. Manual DB correction needed.")
    else:
        # If the S3 move fails, the object remains in the source bucket.
        print(f"S3 move failed for ID {doc_id}. Object remains in "
              f"{aws_config['source_bucket']}/{old_s3_key}. DB not updated.")
    return None
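
# For reference, a minimal sketch of what utils.move_s3_object is assumed to
# do: a server-side copy to the destination followed by a delete of the
# source. The name and body below are illustrative assumptions; the script
# itself calls the real implementation in the (unshown) utils module.
def _move_s3_object_sketch(s3_client, src_bucket, src_key, dst_bucket, dst_key):
    try:
        # copy_object performs a server-side copy; no object data passes
        # through this machine (single-call copies are limited to 5 GB).
        s3_client.copy_object(
            Bucket=dst_bucket,
            Key=dst_key,
            CopySource={'Bucket': src_bucket, 'Key': src_key}
        )
        s3_client.delete_object(Bucket=src_bucket, Key=src_key)
        return True
    except ClientError as e:
        print(f"Move failed for {src_bucket}/{src_key}: {e}")
        return False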

def migrate_documents(s3_client, aws_config, documents):
    """Migrates every document whose stored key is missing from the destination bucket."""
    dest_bucket = aws_config['destination_bucket']
    for doc_id, s3_key in documents:
        if check_s3_key_exists(s3_client, dest_bucket, s3_key):
            print(f"✅ Key '{s3_key}' exists in bucket '{dest_bucket}'.")
        else:
            new_s3_key = migrate_object(s3_client, aws_config, s3_key, doc_id)
            if new_s3_key and check_s3_key_exists(s3_client, dest_bucket, new_s3_key):
                print(f"✅ ✅ Key '{new_s3_key}' migrated and now exists.")
            else:
                print(f"❌ Key '{s3_key}' still does not exist after the migration attempt.")

def main():
    """Executes the S3 migration and database update workflow."""
    config = load_config()
    # Initialize the S3 client with credentials from config
    aws_config = config['aws']
    try:
        s3_client = boto3.client(
            's3',
            aws_access_key_id=aws_config['aws_access_key_id'],
            aws_secret_access_key=aws_config['aws_secret_access_key'],
            region_name=aws_config['aws_region']
        )
        print("S3 client initialized successfully.")
    except Exception as e:
        print(f"Failed to initialize S3 client: {e}")
        return
    # Fetch documents to migrate
    documents = fetch_documents(config['mysql'])
    if not documents:
        print("No documents found or failed to connect to the database. Exiting.")
        return
    print(f"Documents Found: {len(documents)}")
    migrate_documents(s3_client, aws_config, documents)


if __name__ == "__main__":
    main()
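
# Usage (config.json is expected in the working directory):
#   python aws-s3-migrate-missing-documents.py
# Re-running is safe for already-migrated rows: their updated keys are found
# in the destination bucket and skipped.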