First, my apologies if this is not relevant to this group.
I am new to C#. We have a requirement to generate Parquet files from CSV files, so I created the console program below. It works fine for small files, but our files are up to 30 GB, and I understand the program as I wrote it is not acceptable at that scale.
Can someone suggest the most performant approach, please?
C#:
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using ChoETL;
using Parquet;
namespace split_csv
{
    class Program
    {
        /// <summary>
        /// Custom file reader that enumerates the lines of a file in batches.
        /// </summary>
        public class CustomFileReader : IEnumerable<List<string>>, IDisposable
        {
            // The inner stream reader object
            readonly StreamReader sr;
            readonly int _batchSize = 1;

            /// <summary>
            /// Constructor
            /// </summary>
            /// <param name="path">File path</param>
            /// <param name="batchSize">Size of the batch; must be greater than 0</param>
            public CustomFileReader(string path, int batchSize)
            {
                if (batchSize > 0)
                {
                    _batchSize = batchSize;
                }
                else
                {
                    throw new ArgumentException("Batch size should be greater than zero", nameof(batchSize));
                }
                sr = File.OpenText(path);
            }

            public void Dispose()
            {
                // Close the file reader.
                if (sr != null)
                {
                    sr.Close();
                }
            }

            // IEnumerable interface
            public IEnumerator<List<string>> GetEnumerator()
            {
                string input;
                while (!sr.EndOfStream)
                {
                    int i = 0;
                    List<string> batch = new List<string>();
                    // ReadLine() returns null only at end of stream, so blank
                    // lines inside the file no longer end a batch early.
                    while (i < _batchSize && (input = sr.ReadLine()) != null)
                    {
                        batch.Add(input);
                        i++;
                    }
                    if (batch.Count != 0)
                    {
                        yield return batch;
                    }
                }
            }

            IEnumerator IEnumerable.GetEnumerator()
            {
                return GetEnumerator();
            }
        }
        static void Main(string[] args)
        {
            string filePath_out = "C:\\test.Parquet";
            int batch_size = 100000;

            // Dispose the reader here instead of inside GetEnumerator().
            using (var reader = new CustomFileReader("C:\\test.csv", batch_size))
            using (var writer = new ChoParquetWriter(filePath_out,
                    new ChoParquetRecordConfiguration
                    {
                        CompressionMethod = Parquet.CompressionMethod.Snappy,
                        MayContainEOLInData = true
                    })
                .Configure(c => c.FieldValueTrimOption = ChoFieldValueTrimOption.None)
                .Configure(c => c.LiteParsing = true)
                .Configure(c => c.RowGroupSize = 5000))
            {
                foreach (List<string> batch in reader)
                {
                    // Re-join the batch into one CSV fragment and parse it.
                    string joined = string.Join("\n", batch);
                    using (var r = ChoCSVReader.LoadText(joined)
                        .Configure(c => c.NullValueHandling = ChoNullValueHandling.Empty)
                        .WithMaxScanRows(1000))
                    {
                        writer.Write(r);
                    }
                }
            }
        }
    }
}
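For what it's worth, the ChoETL samples seem to stream the CSV reader straight into the Parquet writer, without any manual batching in between. A minimal sketch of that approach is below; the paths are placeholders, WithFirstLineHeader() is an assumption about my data, and I have not verified it on a 30 GB file:

C#:
using ChoETL;

class DirectConvert
{
    static void Main()
    {
        // Stream records from the CSV reader straight into the Parquet writer.
        // "C:\\test.csv" and "C:\\test.parquet" are placeholder paths;
        // WithFirstLineHeader() assumes the CSV has a header row.
        using (var csv = new ChoCSVReader("C:\\test.csv").WithFirstLineHeader())
        using (var parquet = new ChoParquetWriter("C:\\test.parquet"))
        {
            // Write() enumerates the reader lazily, so the whole
            // file should never have to sit in memory at once.
            parquet.Write(csv);
        }
    }
}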