Resolved create parquet file from csv

Srini

Member
Joined
Nov 15, 2022
Messages
6
Programming Experience
Beginner
First, please accept my apologies if this is not relevant to this group.

I am new to C#. We have a requirement to generate Parquet files from CSV files.
I created the console program below, which works OK for small files, but our files are up to 30 GB.
I understand that the program I created is unacceptable for files of that size.

Can someone please suggest the best-performing solution?

C#:
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using ChoETL;
using Parquet;


namespace split_csv
{
    class Program
    {

        /// <summary>
        /// Custom file reader that enumerates the non-empty lines of a text file
        /// in batches of at most <c>batchSize</c> lines.
        /// </summary>
        /// <remarks>
        /// The underlying file is opened in the constructor and read forward only,
        /// so an instance can be enumerated once. The file is closed when the
        /// enumeration finishes, when it is abandoned (for example by a
        /// <c>break</c> or an exception inside a <c>foreach</c>), or when
        /// <see cref="Dispose"/> is called.
        /// </remarks>
        public class CustomFileReader : IEnumerable<List<string>>,
            IDisposable
        {

            // The inner stream reader object
            private readonly StreamReader sr;
            private readonly int _batchSize = 1;

            /// <summary>
            /// Opens the file at <paramref name="path"/> for batched reading.
            /// </summary>
            /// <param name="path">File path</param>
            /// <param name="batchSize">Maximum number of lines per batch; must be greater than 0</param>
            /// <exception cref="ArgumentException">
            /// Thrown when <paramref name="batchSize"/> is zero or negative.
            /// </exception>
            public CustomFileReader(string path, int batchSize)
            {
                // Validate before opening the file so a bad argument never leaves a handle open.
                if (batchSize <= 0)
                {
                    throw new ArgumentException("Batch size should be greater than Zero",
                        nameof(batchSize));
                }

                _batchSize = batchSize;
                sr = File.OpenText(path);
            }

            /// <summary>
            /// Closes the underlying file reader. Safe to call more than once.
            /// </summary>
            public void Dispose()
            {
                if (sr != null)
                {
                    sr.Close();
                }
            }

            /// <summary>
            /// Reads the file from its current position and yields the lines in batches.
            /// </summary>
            /// <returns>
            /// A sequence of lists, each holding between 1 and <c>batchSize</c>
            /// non-empty lines in file order. Only the final batch may be short.
            /// </returns>
            public IEnumerator<List<string>> GetEnumerator()
            {
                try
                {
                    List<string> batch = new List<string>();
                    string input;

                    // ReadLine returns null only at end of file; an empty string is
                    // just a blank line and must not be mistaken for EOF.
                    while ((input = sr.ReadLine()) != null)
                    {
                        // Blank lines are skipped, but they do not cut the batch short.
                        if (input.Length == 0)
                        {
                            continue;
                        }

                        batch.Add(input);

                        if (batch.Count == _batchSize)
                        {
                            yield return batch;
                            batch = new List<string>();
                        }
                    }

                    // Flush whatever is left after the last full batch.
                    if (batch.Count != 0)
                    {
                        yield return batch;
                    }
                }
                finally
                {
                    // Runs on normal completion and when the consumer stops early,
                    // so the file handle is always released.
                    Dispose();
                }
            }

            /// <summary>
            /// Non-generic enumerator; forwards to the generic implementation.
            /// </summary>
            IEnumerator IEnumerable.GetEnumerator()
            {
                return GetEnumerator();
            }
        }

        /// <summary>
        /// Converts C:\test.csv into a Snappy-compressed Parquet file at
        /// C:\test.Parquet, reading the CSV in batches of lines and writing
        /// each parsed batch to the Parquet writer.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            string outputPath = "C:\\test.Parquet";
            string inputPath = "C:\\test.csv";
            int batchSize = 100000;

            // NOTE(review): per the reply in this thread, ChoCSVReader can reportedly
            // stream a CSV file directly, which would make the manual line batching
            // unnecessary; confirm against the ChoETL documentation before switching.

            // Both the reader and the writer are disposed even when parsing or
            // writing throws part-way through the file.
            using (var reader = new CustomFileReader(inputPath, batchSize))
            using (var writer = new ChoParquetWriter(outputPath, new ChoParquetRecordConfiguration { CompressionMethod = Parquet.CompressionMethod.Snappy, MayContainEOLInData = true })
                     .Configure(c => c.FieldValueTrimOption = ChoFieldValueTrimOption.None)
                     .Configure(c => c.LiteParsing = true)
                     .Configure(c => c.RowGroupSize = 5000)
                  )
            {
                foreach (List<string> batch in reader)
                {
                    // Rebuild the CSV text for this batch in one pass instead of
                    // concatenating a temporary string for every line.
                    string csvText = string.Join("\n", batch);

                    using (var r = ChoCSVReader.LoadText(csvText)
                            .Configure(c => c.NullValueHandling = ChoNullValueHandling.Empty)
                            .WithMaxScanRows(1000)
                          )
                    {
                        writer.Write(r);
                    }
                }
            }
        }
    }
}
 
Last edited by a moderator:
Based on my very quick scan (less than 3 minutes) of the CodeProject post link below, the ChoCsvReader already supports streamed reading of a CSV, so there is really no need for you to read the .CSV file in line batches to feed it into the ChoCsvReader.LoadText().

 
Back
Top Bottom