<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:dc="http://purl.org/dc/elements/1.1/">
  <channel>
    <title>DEV Community: Pooja-Honey</title>
    <description>The latest articles on DEV Community by Pooja-Honey (@poojahoney).</description>
    <link>https://dev.to/poojahoney</link>
    <image>
      <url>https://media2.dev.to/dynamic/image/width=90,height=90,fit=cover,gravity=auto,format=auto/https:%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F905059%2F85a923e7-1893-4127-b4b6-6e5d5f095e89.jpeg</url>
      <title>DEV Community: Pooja-Honey</title>
      <link>https://dev.to/poojahoney</link>
    </image>
    <atom:link rel="self" type="application/rss+xml" href="https://dev.to/feed/poojahoney"/>
    <language>en</language>
    <item>
      <title>AWS S3 Upload large excel file</title>
      <dc:creator>Pooja-Honey</dc:creator>
      <pubDate>Mon, 08 Aug 2022 04:27:34 +0000</pubDate>
      <link>https://dev.to/poojahoney/aws-s3-upload-large-excel-file-1enj</link>
      <guid>https://dev.to/poojahoney/aws-s3-upload-large-excel-file-1enj</guid>
      <description>&lt;p&gt;Could anyone help me upload a large Excel file to S3 using multipart upload, and then read the data back from the uploaded multipart file?&lt;/p&gt;

&lt;p&gt;Currently I upload the file in multiple parts by converting the file bytes into a pandas DataFrame and splitting it into chunks with NumPy's array_split. Is this the correct approach, or are there any alternatives? I ask because converting the bytes into pandas DataFrames for the multipart upload takes a long time.&lt;/p&gt;

&lt;p&gt;The following is my code:&lt;br&gt;
 def multipart_upload(self, filename: str, user_settings: UserSettings, model: RootModel, location: str, content_buffer: Any,&lt;br&gt;
                         content_df: pd.DataFrame, content_type: str):&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;    s3_client = get_boto3_s3_client()
    chunksize = 5 * 1024 * 1024

    part_number = 0
    chunk: pd.DataFrame
    parts_info = []

    key_name = S3UtilsBase().prepare_s3_key_path(filename=filename,
                                                 location=location,
                                                 model=model,
                                                 user_settings=user_settings)

    multipart_upload_resp = s3_client.create_multipart_upload(
        Bucket=settings.AWS_S3_BUCKET, Key=key_name)

    for chunk in np.array_split(content_df, len(content_buffer.getvalue()) // chunksize):
        buffer = io.BytesIO()
        part_number = part_number + 1
        excel_file_types = S3UtilsBase().get_excel_types()
        if content_type == 'csv':
            chunk.to_csv(buffer)
        elif content_type in excel_file_types:
            chunk.to_excel(buffer)

        chunk_resp = s3_client.upload_part(Bucket=settings.AWS_S3_BUCKET,
                                           Key=multipart_upload_resp['Key'],
                                           PartNumber=part_number,
                                           UploadId=multipart_upload_resp['UploadId'],
                                           Body=buffer.getvalue())

        parts_info.append({
            'PartNumber': part_number,
            'ETag': chunk_resp['ETag']
        })

    parts_info = sorted(parts_info, key=lambda x: x["PartNumber"])
    cmp_multipart_upload_resp = s3_client.complete_multipart_upload(Bucket=settings.AWS_S3_BUCKET,
                                                                    Key=multipart_upload_resp['Key'],
                                                                    UploadId=multipart_upload_resp['UploadId'],
                                                                    MultipartUpload={"Parts": parts_info})

    paths = cmp_multipart_upload_resp["Key"].split("/")
    separator = "/"
    prefix = separator.join(paths[:-1])
    return AttachmentResponseFromS3(
        Prefix=prefix,
        Error=False,
        Message='',
        Version=cmp_multipart_upload_resp["VersionId"],
        Host=f'{AWS_S3}://{cmp_multipart_upload_resp["Bucket"]}',
        FileName=paths[-1],
        Parts=part_number
    )
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;

</description>
    </item>
  </channel>
</rss>
