September 13, 2017

CSV Files with Linq

An example of using LINQ to parse a CSV file and write the data to a TSV file. The beauty of the approach shown here is that the file is processed one line at a time: when parsing, each line is read lazily, so you do not have to load the whole file into memory to process the data, and when writing, the output is likewise written one line at a time. Here is a sample piece of the CSV file (note that the first line is a header line, not a data line):
id,edsm_id,name,x,y,z,population,is_populated,government_id,government,allegiance_id,allegiance,state_id,state,security_id,security,primary_economy_id,primary_economy,power,power_state,power_state_id,needs_permit,updated_at,simbad_ref,controlling_minor_faction_id,controlling_minor_faction,reserve_type_id,reserve_type
17,60,"10 Ursae Majoris",0.03125,34.90625,-39.09375,0,0,176,None,5,None,80,None,16,Low,10,None,,,,0,1497906646,"10 Ursae Majoris",,,,
24,12009,"11 Bootis",-49.40625,285.25,65.21875,0,0,176,None,5,None,80,None,16,Low,10,None,,,,0,1474116394,"11 Bootis",,,,
26,13308,"11 Mu Aurigae",-30,0.75,-150.03125,0,0,176,None,5,None,80,None,16,Low,10,None,,,,0,1497806946,,,,,


We want to parse some of the fields from each of these lines into this structure:
/// <summary>
/// Plain data holder for the handful of per-system values this example keeps:
/// an id, a name and a three-component location.
/// </summary>
public class BaseSystemRaw
{
    /// <summary>Numeric identifier of the system.</summary>
    public int id { get; set; }

    /// <summary>Name of the system.</summary>
    public string name { get; set; }

    /// <summary>First component of the location.</summary>
    public float x { get; set; }

    /// <summary>Second component of the location.</summary>
    public float y { get; set; }

    /// <summary>Third component of the location.</summary>
    public float z { get; set; }

    /// <summary>
    /// Renders the entry as <c>id=..., name=..., location=( x, y, z )</c>.
    /// Numbers are formatted with the current culture.
    /// </summary>
    /// <returns>A single-line, human-readable description of this entry.</returns>
    public override string ToString()
    {
        // Build the location part first so the final format string stays short.
        var location = string.Format( "( {0}, {1}, {2} )", x, y, z );
        return string.Format( "id={0}, name={1}, location={2}", id, name, location );
    }
}
Here is a test harness:
/// <summary>
/// Shows how to parse a CSV file with LINQ while streaming it line by line.
/// </summary>
[ TestFixture ]
class ParseCsvFilesWithLinqTests
{
    // ExtractBaseSystemRaw reads columns 0..5 (id, edsm_id, name, x, y, z),
    // so a usable line must contain at least this many fields.
    private const int RequiredFieldCount = 6;

    /// <summary>
    /// Reads the systems CSV file, converts every data line into a
    /// <see cref="BaseSystemRaw"/> and hands the resulting sequence to
    /// <see cref="DoSomethingWith"/>.
    /// </summary>
    [ Test ]
    public void ParseCsvFileTest()
    {
        // Skip(1) will skip the header line
        // This commented out version will only use the first 10 lines,
        // great for debugging
        // var baseSystemRaws = File.ReadLines(TestFilePaths.SystemsFilePath).
        //     Take(10).Skip(1).Select(line => ExtractBaseSystemRaw(line));
        var baseSystemRaws = File.ReadLines( TestFilePaths.SystemsFilePath ).
                Skip( 1 ).
                Select( line => ExtractBaseSystemRaw( line ) ).
                // Blank or truncated lines yield null; drop them here.
                Where( sys => sys != null );
        DoSomethingWith( baseSystemRaws );
    }

    /// <summary>
    /// Converts one CSV data line into a <see cref="BaseSystemRaw"/>.
    /// </summary>
    /// <param name="line">A single data line (not the header line).</param>
    /// <returns>
    /// The parsed entry, or <c>null</c> when the line is null, blank or has
    /// fewer than six fields. A numeric field that cannot be parsed leaves the
    /// corresponding property at its default value of zero.
    /// </returns>
    BaseSystemRaw ExtractBaseSystemRaw(
        string line )
    {
        if ( string.IsNullOrWhiteSpace( line ) )
        {
            return null;
        }

        var parts = SplitCsvLine( line );
        if ( parts.Count < RequiredFieldCount )
        {
            return null;
        }

        var sysRaw = new BaseSystemRaw();
        int tmp;
        // The file is machine-generated, so parse with the invariant culture
        // instead of whatever culture the test happens to run under.
        if ( int.TryParse( parts[ 0 ], System.Globalization.NumberStyles.Integer,
                 System.Globalization.CultureInfo.InvariantCulture, out tmp ) )
        {
            sysRaw.id = tmp;
        }
        // Ignore parts[1] edsm_id
        // Surrounding quotes were already removed by SplitCsvLine.
        sysRaw.name = parts[ 2 ].Trim();
        sysRaw.x = ParseFloatOrZero( parts[ 3 ] );
        sysRaw.y = ParseFloatOrZero( parts[ 4 ] );
        sysRaw.z = ParseFloatOrZero( parts[ 5 ] );
        // Ignore other parts
        return sysRaw;
    }

    /// <summary>
    /// Parses a float using the invariant culture ('.' as decimal separator).
    /// </summary>
    /// <param name="text">Raw field text.</param>
    /// <returns>The parsed value, or 0 when the text is not a valid number.</returns>
    private static float ParseFloatOrZero(
        string text )
    {
        float value;
        var styles = System.Globalization.NumberStyles.Float |
                     System.Globalization.NumberStyles.AllowThousands;
        return float.TryParse( text, styles,
                   System.Globalization.CultureInfo.InvariantCulture, out value )
            ? value
            : 0f;
    }

    /// <summary>
    /// Splits one CSV line into fields. Commas inside double-quoted fields do
    /// not split the field, the enclosing quotes are dropped, and a doubled
    /// quote inside a quoted field becomes a single literal quote.
    /// </summary>
    /// <param name="line">The line to split; must not be null.</param>
    /// <returns>The fields in order; always at least one element.</returns>
    private static List<string> SplitCsvLine(
        string line )
    {
        var fields = new List<string>();
        var current = new System.Text.StringBuilder();
        bool inQuotes = false;

        for ( int i = 0; i < line.Length; i++ )
        {
            char c = line[ i ];
            if ( inQuotes )
            {
                if ( c != '"' )
                {
                    current.Append( c );
                }
                else if ( i + 1 < line.Length && line[ i + 1 ] == '"' )
                {
                    // Escaped quote ("") inside a quoted field.
                    current.Append( '"' );
                    i++;
                }
                else
                {
                    inQuotes = false;
                }
            }
            else if ( c == '"' )
            {
                inQuotes = true;
            }
            else if ( c == ',' )
            {
                fields.Add( current.ToString() );
                current.Length = 0;
            }
            else
            {
                current.Append( c );
            }
        }

        // The last field has no trailing comma to terminate it.
        fields.Add( current.ToString() );
        return fields;
    }

    /// <summary>
    /// Consumes the parsed entries; here each one is written to the trace output.
    /// </summary>
    /// <param name="baseSystemRaws">Entries to process; enumerated exactly once.</param>
    private void DoSomethingWith(
        IEnumerable<BaseSystemRaw> baseSystemRaws )
    {
        foreach ( var entry in baseSystemRaws )
        {
            Trace.WriteLine( entry );
        }
    }
}
Now write the data to a TSV file.
/// <summary>
/// Shows how to write a TSV file with LINQ, producing the output one line at
/// a time from a lazily read CSV file.
/// </summary>
[ TestFixture ]
class WriteTsvFilesWithLinqTests
{
    // ExtractBaseSystemRaw reads columns 0..5 (id, edsm_id, name, x, y, z).
    private const int RequiredFieldCount = 6;

    // Fixed four-decimal format used for the x, y and z columns.
    private const string CoordinateFormat = "0.0000";

    /// <summary>
    /// Reads the systems CSV file, converts the data lines to
    /// <see cref="BaseSystemRaw"/> entries and writes them, preceded by a
    /// header line, to the TSV output file.
    /// </summary>
    [ Test ]
    public void WriteTsvFileTest()
    {
        const string separator = "\t";
        var headers = new[] { "Id", "System", "x", "y", "z" };
        // Skip(1) will skip the header line
        // This version will only use the first 4 lines, great for debugging, 
        // remove the Take(4) to process the whole file
        var baseSystemRaws = File.ReadLines( TestFilePaths.SystemsFilePath ).
            Take( 4 ).Skip( 1 ).
            Select( line => ExtractBaseSystemRaw( line ) ).
            // Lines that could not be parsed are skipped rather than written
            // as empty rows.
            Where( sys => sys != null );

        // Use Enumerable.Concat to add the header string
        File.WriteAllLines( TestFilePaths.BaseSystemsTsvFilePath,
            Enumerable.Concat( new[] { string.Join( separator, headers ) },
                baseSystemRaws.Select( sys => CoreSystemRawToTsv( separator, sys ) ) ) );
    }

    /// <summary>
    /// Formats one entry as a single output line.
    /// </summary>
    /// <param name="separator">Text placed between adjacent columns.</param>
    /// <param name="sys">The entry to format; must not be null.</param>
    /// <returns>Id, name and the three coordinates joined by <paramref name="separator"/>.</returns>
    private static string CoreSystemRawToTsv(
        string separator,
        BaseSystemRaw sys )
    {
        // Invariant culture keeps '.' as the decimal separator in the output
        // file regardless of the machine's regional settings.
        var invariant = System.Globalization.CultureInfo.InvariantCulture;
        return string.Join( separator,
            sys.id.ToString( invariant ),
            sys.name,
            sys.x.ToString( CoordinateFormat, invariant ),
            sys.y.ToString( CoordinateFormat, invariant ),
            sys.z.ToString( CoordinateFormat, invariant ) );
    }

    /// <summary>
    /// Converts one CSV data line into a <see cref="BaseSystemRaw"/>.
    /// </summary>
    /// <param name="line">A single data line (not the header line).</param>
    /// <returns>
    /// The parsed entry, or <c>null</c> when the line is null, blank or has
    /// fewer than six fields. A numeric field that cannot be parsed leaves the
    /// corresponding property at its default value of zero.
    /// </returns>
    private static BaseSystemRaw ExtractBaseSystemRaw(
        string line )
    {
        if ( string.IsNullOrWhiteSpace( line ) )
        {
            return null;
        }

        var parts = SplitCsvLine( line );
        if ( parts.Count < RequiredFieldCount )
        {
            return null;
        }

        var invariant = System.Globalization.CultureInfo.InvariantCulture;
        var floatStyles = System.Globalization.NumberStyles.Float |
                          System.Globalization.NumberStyles.AllowThousands;
        var sysRaw = new BaseSystemRaw();

        int id;
        if ( int.TryParse( parts[ 0 ], System.Globalization.NumberStyles.Integer,
                 invariant, out id ) )
        {
            sysRaw.id = id;
        }
        // parts[1] (edsm_id) is not needed.
        // Surrounding quotes were already removed by SplitCsvLine.
        sysRaw.name = parts[ 2 ].Trim();

        float coordinate;
        if ( float.TryParse( parts[ 3 ], floatStyles, invariant, out coordinate ) )
        {
            sysRaw.x = coordinate;
        }
        if ( float.TryParse( parts[ 4 ], floatStyles, invariant, out coordinate ) )
        {
            sysRaw.y = coordinate;
        }
        if ( float.TryParse( parts[ 5 ], floatStyles, invariant, out coordinate ) )
        {
            sysRaw.z = coordinate;
        }
        // Remaining columns are ignored.
        return sysRaw;
    }

    /// <summary>
    /// Splits one CSV line into fields. Commas inside double-quoted fields do
    /// not split the field, the enclosing quotes are dropped, and a doubled
    /// quote inside a quoted field becomes a single literal quote.
    /// </summary>
    /// <param name="line">The line to split; must not be null.</param>
    /// <returns>The fields in order; always at least one element.</returns>
    private static System.Collections.Generic.List<string> SplitCsvLine(
        string line )
    {
        var fields = new System.Collections.Generic.List<string>();
        var current = new System.Text.StringBuilder();
        bool inQuotes = false;

        for ( int i = 0; i < line.Length; i++ )
        {
            char c = line[ i ];
            if ( inQuotes )
            {
                if ( c != '"' )
                {
                    current.Append( c );
                }
                else if ( i + 1 < line.Length && line[ i + 1 ] == '"' )
                {
                    // Escaped quote ("") inside a quoted field.
                    current.Append( '"' );
                    i++;
                }
                else
                {
                    inQuotes = false;
                }
            }
            else if ( c == '"' )
            {
                inQuotes = true;
            }
            else if ( c == ',' )
            {
                fields.Add( current.ToString() );
                current.Length = 0;
            }
            else
            {
                current.Append( c );
            }
        }

        // The last field has no trailing comma to terminate it.
        fields.Add( current.ToString() );
        return fields;
    }
}

No comments: