# advanced-programming-techniques-task/main.py at main · bertini97/advanced-programming-techniques-task · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
import matplotlib.pyplot as plt

def _tally_snvs(vcf_file, min_qual=20):
    """Scan a VCF file and tally the single-nucleotide variants passing the filter.

    A record is counted only when its QUAL is strictly greater than
    ``min_qual`` and both REF and ALT are a single A/C/G/T base that differ
    from each other.

    Args:
        vcf_file: Path to a tab-separated, uncompressed VCF text file.
        min_qual: Exclusive lower bound on the QUAL column.

    Returns:
        A tuple ``(positions, transitions, transversions)`` where
        ``positions`` lists the POS value of every counted variant.

    Raises:
        ValueError: If POS or a non-missing QUAL is not numeric.
        IndexError: If a non-blank data line has fewer than six columns.
    """
    transition_pairs = {('A', 'G'), ('G', 'A'), ('C', 'T'), ('T', 'C')}
    bases = frozenset('ACGT')

    positions = []  # where the counted mutations are located
    transitions = 0
    transversions = 0

    with open(vcf_file, 'r') as f:
        for line in f:
            # '#' lines are header/meta lines; blank lines carry no record.
            if line.startswith('#') or not line.strip():
                continue

            cols = line.split('\t')

            qual_field = cols[5].strip()
            # '.' marks a missing QUAL; such a record cannot be shown to
            # exceed the threshold, so it is left out instead of crashing.
            if qual_field == '.':
                continue
            if float(qual_field) <= min_qual:
                continue

            ref = cols[3].upper()
            alt = cols[4].upper()
            # Set membership also enforces length 1, so indels, multi-allelic
            # ALTs ("A,G") and symbolic alleles ('*', '.', 'N') are excluded
            # rather than being miscounted as transversions.
            if ref not in bases or alt not in bases or ref == alt:
                continue

            positions.append(int(cols[1]))
            if (ref, alt) in transition_pairs:
                transitions += 1
            else:
                transversions += 1

    return positions, transitions, transversions


def analyze_human_vcf(vcf_file, output_path, min_qual=20):
    """Report the Ti/Tv ratio of a VCF file and plot its variant density.

    Prints the number of counted variants and the transition/transversion
    ratio, then draws a 50-bin histogram of variant positions, saves it to
    ``output_path`` and displays it.

    Args:
        vcf_file: Path to a tab-separated, uncompressed VCF text file.
        output_path: Destination file for the histogram image.
        min_qual: Exclusive lower bound on QUAL for a record to be counted
            (defaults to 20, the threshold previously hard-coded).
    """
    positions, transitions, transversions = _tally_snvs(vcf_file, min_qual)

    # Summary statistics; the ratio is reported as 0 when there are no
    # transversions to divide by.
    ratio = transitions / transversions if transversions > 0 else 0
    print(f"\n--- Risultati Cromosoma 21 ---")
    print(f"Totale Varianti trovate: {len(positions)}")
    print(f"Rapporto Ti/Tv: {ratio:.2f}")

    fig = plt.figure(figsize=(12, 6))

    plt.hist(positions, bins=50, color='skyblue', edgecolor='black')

    plt.title('Densità delle Mutazioni sul Cromosoma 21')
    plt.xlabel('Posizione sul Cromosoma (coppie di basi)')
    plt.ylabel('Numero di Varianti')
    plt.grid(axis='y', alpha=0.5)

    # Save the chart and report the path it was actually written to.
    plt.savefig(output_path)
    print(f"Grafico salvato come '{output_path}'")
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)


"""S3-based number adder script.

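When run as a script, it downloads a VCF file (chr21_final.vcf) from an S3
input prefix, computes the transition/transversion ratio, plots a histogram
of the variant positions, and uploads the resulting PNG to an S3 output
prefix. The original number-summing workflow (download a file of numbers,
compute their sum, upload the result) remains available through the Adder
class.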

Environment Variables:
    BUCKET_NAME: Name of the S3 bucket to operate on.
    INPUT_PREFIX: S3 key prefix for input files.
    OUTPUT_PREFIX: S3 key prefix for output files.
    AWS_PROFILE (optional): Name of the AWS CLI profile to use.
        If not set, boto3 uses the default credential chain.
"""

import logging
import os
import sys
import tempfile
from typing import Optional
import boto3
from botocore.exceptions import ClientError, NoCredentialsError

# Root logging setup: timestamped "[LEVEL] message" records at INFO and above.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)
# Module-level logger shared by everything defined in this file.
logger = logging.getLogger(__name__)

def _validate_s3_key(key: str) -> None:
    """Validate that an S3 key does not contain path traversal sequences.
    Args:
        key: The S3 object key to validate.
    Raises:
        ValueError: If the key contains '..' path traversal sequences.
    """
    if ".." in key:
        raise ValueError(
            f"Invalid S3 key '{key}': path traversal sequences ('..') are not allowed."
        )

class S3BucketManager:
    """Manages reading and writing files to an S3 bucket.

    This class provides methods to download files from an input prefix
    and upload files to an output prefix within a single S3 bucket.

    Attributes:
        s3: The boto3 S3 client instance.
        bucket: The name of the S3 bucket.
        input_prefix: The S3 key prefix used for reading input files.
        output_prefix: The S3 key prefix used for writing output files.
    """

    def __init__(
        self,
        bucket_name: str,
        input_prefix: str,
        output_prefix: str,
        profile_name: Optional[str] = None,
    ) -> None:
        """Initialize the S3BucketManager.

        Args:
            bucket_name: The name of the S3 bucket.
            input_prefix: The S3 key prefix for input files.
            output_prefix: The S3 key prefix for output files.
            profile_name: Optional AWS CLI profile name. If provided,
                a boto3 session is created with this profile. Otherwise,
                the default credential chain is used.
        """
        if profile_name:
            session = boto3.Session(profile_name=profile_name)
            self.s3 = session.client("s3")
        else:
            self.s3 = boto3.client("s3")
        self.bucket = bucket_name
        self.input_prefix = input_prefix
        self.output_prefix = output_prefix

    def write_data_to_output(self, local_file_path: str, s3_key: str) -> None:
        """Upload a local file to the output prefix in S3.

        The destination key is the configured output prefix directly
        followed by ``s3_key`` (plain concatenation, no separator added).

        Args:
            local_file_path: Path to the local file to upload.
            s3_key: The relative S3 object key (appended to output_prefix).

        Raises:
            ValueError: If ``s3_key`` contains path traversal sequences.
            FileNotFoundError: If ``local_file_path`` does not exist.
            NoCredentialsError: If AWS credentials are not available.
        """
        _validate_s3_key(s3_key)
        # Built outside the try block: string formatting cannot raise any of
        # the exceptions handled below.
        destination = f"{self.output_prefix}{s3_key}"
        try:
            self.s3.upload_file(local_file_path, self.bucket, destination)
            logger.info("Uploaded %s to s3://%s/%s", local_file_path, self.bucket, destination)
        except FileNotFoundError:
            logger.error("The file was not found: %s", local_file_path)
            raise
        except NoCredentialsError:
            logger.error("AWS credentials not available")
            raise

    def read_data_from_input(self, s3_key: str, local_path: str) -> None:
        """Download a file from the input prefix in S3 to a local path.

        The source key is the configured input prefix directly followed by
        ``s3_key`` (plain concatenation, no separator added).

        Args:
            s3_key: The relative S3 object key (appended to input_prefix).
            local_path: Path where the downloaded file will be saved.

        Raises:
            ValueError: If ``s3_key`` contains path traversal sequences.
            ClientError: If the S3 download fails (e.g., object not found,
                permission denied).
            NoCredentialsError: If AWS credentials are not available.
        """
        _validate_s3_key(s3_key)
        source = f"{self.input_prefix}{s3_key}"
        try:
            self.s3.download_file(self.bucket, source, local_path)
            logger.info("Downloaded s3://%s/%s to %s", self.bucket, source, local_path)
        except ClientError as e:
            # Log the full key that was requested (prefix included) so the
            # reported URI matches the object S3 was actually asked for.
            logger.error("Error downloading s3://%s/%s: %s", self.bucket, source, e)
            raise
        except NoCredentialsError:
            logger.error("AWS credentials not available")
            raise


class Adder:
    """Sums the numbers listed in a text file and stores the total in another file.

    Lines that do not parse as a float are reported with a warning and ignored.

    Attributes:
        input_file_path: Path to the file containing numbers (one per line).
        output_file_path: Path to the file where the sum will be written.
    """

    def __init__(self, input_file_path: str, output_file_path: str) -> None:
        """Store the input and output file locations.

        Args:
            input_file_path: Path to the input file with one number per line.
            output_file_path: Path to the file where the result will be written.
        """
        self.output_file_path = output_file_path
        self.input_file_path = input_file_path

    def add(self) -> float:
        """Sum every parseable line of the input file and write the total out.

        Whitespace-only lines are ignored. A line that ``float`` rejects is
        logged as a warning together with its 1-based line number and does
        not contribute to the total.

        Returns:
            The sum of all valid numbers found in the input file.
        """
        running_sum = 0.0
        with open(self.input_file_path, "r") as source:
            line_number = 0
            for raw_line in source:
                line_number += 1
                text = raw_line.strip()
                if text:
                    try:
                        value = float(text)
                    except ValueError:
                        logger.warning(
                            "Line %d is not a valid number: '%s' — skipping",
                            line_number,
                            text,
                        )
                    else:
                        running_sum += value

        with open(self.output_file_path, "w") as sink:
            sink.write(str(running_sum))
        logger.info("Sum written to %s: %s", self.output_file_path, running_sum)
        return running_sum

if __name__ == "__main__":
    # All settings come from the environment; only AWS_PROFILE may be absent.
    bucket_name = os.getenv("BUCKET_NAME")
    input_prefix = os.getenv("INPUT_PREFIX")
    output_prefix = os.getenv("OUTPUT_PREFIX")
    aws_profile = os.getenv("AWS_PROFILE")

    required = {
        "BUCKET_NAME": bucket_name,
        "INPUT_PREFIX": input_prefix,
        "OUTPUT_PREFIX": output_prefix,
    }
    # Unset and empty values both count as missing.
    missing = [name for name, value in required.items() if not value]
    if missing:
        logger.error(
            "Missing required environment variables: %s", ", ".join(missing)
        )
        sys.exit(1)

    manager = S3BucketManager(
        bucket_name,
        input_prefix,
        output_prefix,
        profile_name=aws_profile,
    )

    # The temporary directory is removed automatically when the block exits.
    with tempfile.TemporaryDirectory() as tmp_dir_name:
        logger.info("Created temporary directory %s", tmp_dir_name)
        input_tmp_file_path = os.path.join(tmp_dir_name, "chr21_final.vcf")
        output_tmp_file_path = os.path.join(tmp_dir_name, "output.png")

        # Fetch the VCF file from the S3 input prefix.
        manager.read_data_from_input("chr21_final.vcf", input_tmp_file_path)
        # Compute the variant statistics and render the density histogram.
        analyze_human_vcf(input_tmp_file_path, output_tmp_file_path)
        # Publish the rendered histogram under the S3 output prefix.
        manager.write_data_to_output(output_tmp_file_path, "variant_density_chr21.png")