From 71d4ae182063ebb333d3ff97c6f521a44435b551 Mon Sep 17 00:00:00 2001
From: Vikas Nale
Date: Wed, 26 Nov 2025 19:24:23 +0530
Subject: [PATCH] Add dated migration for S3

---
 .../aws-s3-migrate-missing-documents.py | 168 ++++++++++++++++++
 aws-s3-migration/config.json            |   4 +-
 aws-s3-migration/utils.py               |  95 ++++++++++
 3 files changed, 265 insertions(+), 2 deletions(-)
 create mode 100644 aws-s3-migration/aws-s3-migrate-missing-documents.py
 create mode 100644 aws-s3-migration/utils.py

diff --git a/aws-s3-migration/aws-s3-migrate-missing-documents.py b/aws-s3-migration/aws-s3-migrate-missing-documents.py
new file mode 100644
index 0000000..aee3bb7
--- /dev/null
+++ b/aws-s3-migration/aws-s3-migrate-missing-documents.py
@@ -0,0 +1,168 @@
+import json
+import os
+import sys
+
+import boto3
+import mysql.connector
+from botocore.exceptions import ClientError
+
+import utils
+
+# --- Configuration Loading ---
+
+CONFIG_FILE = 'config.json'
+
+def load_config():
+    """Loads configuration from config.json."""
+    if not os.path.exists(CONFIG_FILE):
+        print(f"Error: Configuration file '{CONFIG_FILE}' not found.")
+        sys.exit(1)
+    try:
+        with open(CONFIG_FILE, 'r') as f:
+            return json.load(f)
+    except json.JSONDecodeError as e:
+        print(f"Error decoding JSON in config file: {e}")
+        sys.exit(1)
+
+def fetch_documents(db_config_full):
+    """
+    Connects to MySQL and fetches the ID and S3Key of every document
+    uploaded in the migration window.
+    NOTE: db_config_full includes the 'table' key, which must be filtered
+    out before connecting.
+    """
+    conn = None
+    cursor = None
+
+    # Separate connection parameters from the table name
+    table_name = db_config_full['table']
+    connection_config = {k: v for k, v in db_config_full.items() if k != 'table'}
+
+    try:
+        # Connect using only valid connection parameters
+        conn = mysql.connector.connect(**connection_config)
+        cursor = conn.cursor()
+
+        print(f"Fetching document IDs and S3Keys from table: {table_name}...")
+
+        # BETWEEN '2025-11-01' AND '2025-11-26' would stop at midnight on the
+        # 26th, so use a half-open range that covers the whole end day.
+        query = (
+            f"SELECT ID, S3Key FROM {table_name} "
+            "WHERE UploadedAt >= '2025-11-01' AND UploadedAt < '2025-11-27' "
+            "ORDER BY UploadedAt DESC"
+        )
+        print(query)
+        cursor.execute(query)
+
+        # Fetch results as a list of (ID, S3Key) tuples
+        documents = cursor.fetchall()
+        print(f"Found {len(documents)} documents to process.")
+        return documents
+
+    except mysql.connector.Error as err:
+        print(f"MySQL Error: {err}")
+        return []
+
+    finally:
+        if cursor:
+            cursor.close()
+        if conn and conn.is_connected():
+            conn.close()
+
+def check_s3_key_exists(s3_client, bucket_name, key):
+    """Returns True if the key exists in the bucket, False otherwise."""
+    try:
+        # head_object fetches only metadata, so it is faster than listing.
+        s3_client.head_object(Bucket=bucket_name, Key=key)
+        return True
+    except ClientError as e:
+        # A 404 means the key does not exist. Anything else (403 permission
+        # denied, bucket not found, ...) is a real problem and is re-raised.
+        if e.response['Error']['Code'] == '404':
+            return False
+        print(f"Error checking key '{key}' in bucket '{bucket_name}': {e}")
+        raise
+
+def migrate_object(s3_client, aws_config, db_config, old_s3_key, doc_id):
+    """
+    Moves a single object to the destination bucket and updates its DB row.
+    Returns the new S3 key on success, or None on failure.
+    """
+    # 1. Generate the new key, preserving the path structure
+    key_prefix = aws_config['destination_key_prefix']
+    new_s3_key = utils.generate_new_s3_key(old_s3_key, key_prefix)
+    if not new_s3_key:
+        print(f"  -> Skipping ID {doc_id}: Could not generate new key from old key.")
+        return None
+
+    print(f"  -> Calculated New Key: {new_s3_key}")
+
+    # 2. Move S3 object (Copy + Delete)
+    move_successful = utils.move_s3_object(
+        s3_client,
+        aws_config['source_bucket'],
+        old_s3_key,
+        aws_config['destination_bucket'],
+        new_s3_key
+    )
+
+    if not move_successful:
+        # The object is still in the source bucket; leave the DB alone.
+        print(f"S3 move failed for ID {doc_id}. Object remains in "
+              f"{aws_config['source_bucket']}/{old_s3_key}. DB not updated.")
+        return None
+
+    # 3. Update database
+    db_update_successful = utils.update_document_key(db_config, doc_id, new_s3_key)
+    if db_update_successful:
+        return new_s3_key
+
+    # The S3 object is already MOVED, so a failed DB update needs manual repair.
+    print(f"CRITICAL: DB update failed for ID {doc_id}. Object is MOVED to "
+          f"{aws_config['destination_bucket']}/{new_s3_key}. Manual DB correction needed.")
+    return None
+
+def migrate_documents(s3_client, aws_config, db_config, documents):
+    dest_bucket = aws_config['destination_bucket']
+
+    for doc_id, s3_key in documents:
+        if check_s3_key_exists(s3_client, dest_bucket, s3_key):
+            print(f"✅ Key '{s3_key}' already exists in bucket '{dest_bucket}'.")
+            continue
+
+        new_s3_key = migrate_object(s3_client, aws_config, db_config, s3_key, doc_id)
+        if new_s3_key and check_s3_key_exists(s3_client, dest_bucket, new_s3_key):
+            print(f"✅ ✅ Key '{new_s3_key}' migrated and now exists.")
+        else:
+            print(f"❌ Key '{s3_key}' still does not exist after the migration attempt.")
+
+def main():
+    """Executes the S3 migration and database update workflow."""
+    config = load_config()
+
+    # Initialize S3 Client
+    aws_config = config['aws']
+    try:
+        s3_client = boto3.client(
+            's3',
+            aws_access_key_id=aws_config['aws_access_key_id'],
+            aws_secret_access_key=aws_config['aws_secret_access_key'],
+            region_name=aws_config['aws_region']
+        )
+        print("S3 client initialized successfully.")
+    except Exception as e:
+        print(f"Failed to initialize S3 client: {e}")
+        return
+
+    # Fetch Documents
+    documents = fetch_documents(config['mysql'])
+    if not documents:
+        print("No documents found or failed to connect to the database. Exiting.")
+        return
+    print(f"Documents found: {len(documents)}")
+
+    migrate_documents(s3_client, aws_config, config['mysql'], documents)
+
+if __name__ == "__main__":
+    main()
diff --git a/aws-s3-migration/config.json b/aws-s3-migration/config.json
index 517b819..3abfc9c 100644
--- a/aws-s3-migration/config.json
+++ b/aws-s3-migration/config.json
@@ -3,7 +3,7 @@
     "host": "147.93.98.152",
     "user": "devuser",
     "password": "AppUser@123$",
-    "database": "MarcoBMSStage",
+    "database": "MarcoBMSProd",
     "table": "Documents"
   },
   "aws": {
@@ -11,7 +11,7 @@
     "aws_secret_access_key": "ZEnwQuN9vWi+KodD+gvNEz46VCZ6/4ue3UKLbTTs",
     "aws_region": "us-east-1",
     "source_bucket": "testenv-marco-pms-documents",
-    "destination_bucket": "tryenv-marco-pms-documents",
+    "destination_bucket": "prodenv-marco-pms-documents",
     "destination_key_prefix": "",
     "destination_key_prefix_how": "IMPORTANT: The original S3Key (e.g., 'reports/2024/doc.pdf') will be APPENDED to this prefix, preserving the path structure. Resulting Key: 'migrated_files/reports/2024/doc.pdf'"
   }
diff --git a/aws-s3-migration/utils.py b/aws-s3-migration/utils.py
new file mode 100644
index 0000000..3330de7
--- /dev/null
+++ b/aws-s3-migration/utils.py
@@ -0,0 +1,95 @@
+import mysql.connector
+
+
+def generate_new_s3_key(old_key, prefix):
+    """
+    Generates a new S3 key by prepending the destination prefix to the old key,
+    thereby preserving the original path structure.
+    """
+    if not isinstance(old_key, str) or not old_key:
+        return None
+
+    # Ensure the prefix ends with a slash if it is non-empty and lacks one.
+    prefixed_path = prefix
+    if prefix and not prefix.endswith('/'):
+        prefixed_path += '/'
+
+    # Combine the prefix and the old key; strip any leading slash from the old key.
+    final_key = old_key.lstrip('/')
+
+    return f"{prefixed_path}{final_key}"
+
+def move_s3_object(s3_client, source_bucket, source_key, dest_bucket, dest_key):
+    """
+    Performs the S3 'move' (Copy + Delete). The delete step is deliberately
+    commented out for now, so the source bucket keeps a fallback copy.
+
+    Returns True on success, False otherwise.
+    """
+    copy_source = {
+        'Bucket': source_bucket,
+        'Key': source_key
+    }
+
+    try:
+        print(f"  -> Copying '{source_key}' to '{dest_key}' in '{dest_bucket}'...")
+        # 1. Copy the object
+        s3_client.copy_object(
+            CopySource=copy_source,
+            Bucket=dest_bucket,
+            Key=dest_key
+        )
+        print("  -> Copy object successful.")
+
+        # print(f"  -> Deleting original object from '{source_bucket}/{source_key}'...")
+        # # 2. Delete the original object
+        # s3_client.delete_object(
+        #     Bucket=source_bucket,
+        #     Key=source_key
+        # )
+        # print("  -> Delete object successful.")
+        return True
+
+    except Exception as e:
+        print(f"  -> S3 Move FAILED for {source_key}: {e}")
+        return False
+
+def update_document_key(db_config_full, doc_id, new_s3_key):
+    """
+    Updates the S3Key for a specific document ID in the database.
+    NOTE: db_config_full includes the 'table' key, which must be filtered
+    out before connecting.
+    """
+    conn = None
+    cursor = None
+
+    # Separate connection parameters from the table name
+    table_name = db_config_full['table']
+    connection_config = {k: v for k, v in db_config_full.items() if k != 'table'}
+
+    try:
+        # Connect using only valid connection parameters
+        conn = mysql.connector.connect(**connection_config)
+        cursor = conn.cursor()
+
+        # Parameterized UPDATE; only the table name comes from config.
+        update_query = f"UPDATE {table_name} SET S3Key = %s WHERE ID = %s"
+
+        cursor.execute(update_query, (new_s3_key, doc_id))
+
+        # Commit the transaction to apply the change
+        conn.commit()
+        print(f"  -> DB Update SUCCESS for ID {doc_id}.")
+        return True
+
+    except mysql.connector.Error as err:
+        print(f"  -> DB Update FAILED for ID {doc_id}: {err}. Rolling back.")
+        if conn:
+            conn.rollback()
+        return False
+
+    finally:
+        if cursor:
+            cursor.close()
+        if conn and conn.is_connected():
+            conn.close()
\ No newline at end of file
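
A minimal dry-run sketch (not part of the patch) that may help sanity-check the
date window before and after the real run. It assumes the config.json layout
above and uses the same head_object existence check as check_s3_key_exists, but
it only reads: nothing is copied, deleted, or written to the database. The
helper name find_missing() is illustrative, not existing code.

import json

import boto3
import mysql.connector
from botocore.exceptions import ClientError


def find_missing(config):
    """Return (ID, S3Key) rows whose key is absent from the destination bucket."""
    # Same split as the migration script: 'table' is not a connection parameter.
    db = {k: v for k, v in config['mysql'].items() if k != 'table'}
    table = config['mysql']['table']

    conn = mysql.connector.connect(**db)
    try:
        cursor = conn.cursor()
        cursor.execute(
            f"SELECT ID, S3Key FROM {table} "
            "WHERE UploadedAt >= '2025-11-01' AND UploadedAt < '2025-11-27'"
        )
        rows = cursor.fetchall()
    finally:
        conn.close()

    aws = config['aws']
    s3 = boto3.client(
        's3',
        aws_access_key_id=aws['aws_access_key_id'],
        aws_secret_access_key=aws['aws_secret_access_key'],
        region_name=aws['aws_region'],
    )

    missing = []
    for doc_id, key in rows:
        try:
            # HEAD is read-only: nothing is moved or updated here.
            s3.head_object(Bucket=aws['destination_bucket'], Key=key)
        except ClientError as e:
            if e.response['Error']['Code'] != '404':
                raise
            missing.append((doc_id, key))
    return missing


if __name__ == "__main__":
    with open('config.json') as f:
        cfg = json.load(f)
    for doc_id, key in find_missing(cfg):
        print(f"MISSING in {cfg['aws']['destination_bucket']}: ID={doc_id}, S3Key={key}")

Running this before the migration gives the work list, and running it again
afterwards should print nothing; any remaining rows point at failed copies or
at the CRITICAL moved-but-not-updated case the migration script logs.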