Source code for cytocalc.csmparser

#! /usr/bin/env python3
# Contains classes CSMParser (for parsing trajectory file: report####.txt)
# and CYMParser (for parsing simulation parameters: configurations.cym)

import os
import re
import io
import warnings
import sys
from cytocalc.csmframe import CSMFrame
from cytocalc.csmsimulation import CSMSimulation
import pandas as pd


[docs]
class CSMParser:
    """
    Parses the cytosim trajectory file for a given frame
    Returns an extended DataFrame object (CSMFrame)
    """

    # Frame time written in a `% time <value>` comment line. Accepts plain
    # integers, decimals and scientific notation (5, 0.5, 1.5e-05, ...).
    _TIME_PATTERN = re.compile(
        r'% time\s+(?P<time>[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)'
    )

    # Reports that carry one extra comment line between the `% report` line
    # and the column-header line.
    _REPORT_TYPE_II = frozenset({'fiber', 'fiber:confine_force', 'fiber:position'})
    # Reports that have no column-header line at all.
    _REPORT_TYPE_III = frozenset({'fiber:distribution'})
    # Reports this parser refuses to read.
    _REPORT_UNSUPPORTED = frozenset({'fiber:speckle', 'bead:singles'})

    def __init__(self):
        pass

    def parse_frameFile(self,filename):
        """
        Parses a file that holds the data of a single frame

        `filename` is opened in text mode and handed to `parse_frame`.

        Returns whatever `parse_frame` returns (a CSMFrame)
        """
        with open(filename, 'r') as frame:
            csmframe = self.parse_frame(frame)
        return csmframe

    def parse_simFile(self,filename):
        """
        Parses cytosim trajectory file generated using `report`

        Every block of lines between a line starting with `% frame` and the
        next line starting with `% end` is parsed with `parse_frame` and
        added to the simulation, in file order. Lines outside such blocks
        are ignored.

        A frame that is still open when the file ends (no closing `% end`)
        is discarded and a UserWarning is emitted.

        Returns a CSMSimulation
        """
        sim = CSMSimulation()
        frame_lines = None  # None while we are outside a frame block
        with open(filename, 'r') as file_object:
            for line in file_object:
                if frame_lines is None:
                    if line.startswith('% frame'):
                        frame_lines = []
                elif line.startswith('% end'):
                    frame_text = io.StringIO(''.join(frame_lines))
                    sim.add_frame(self.parse_frame(frame_text))
                    frame_lines = None
                else:
                    frame_lines.append(line)

        if frame_lines is not None:
            # Reaching EOF inside a frame means the file was truncated;
            # a partial frame would silently under-report its rows.
            warnings.warn(
                "Reached end of file inside a frame (missing '% end'). "
                "The incomplete frame was discarded.",
                UserWarning,
            )
        return sim

    def parse_frame(self,frame_data):
        """
        Parses the cytosim trajectory data of a single frame

        `frame_data` is any iterable of text lines, e.g. an open file or an
        io.StringIO.

        The frame time is read from a `% time <value>` line (0 if absent).
        The report keyword is the third token of the `% report ...` line.
        Column headers are taken from the comment line that follows the
        report line (one line later for the reports listed in
        `_REPORT_TYPE_II`, never for those in `_REPORT_TYPE_III`). If the
        line expected to hold the headers is not a comment, the frame is
        treated as header-less and that line is parsed as data.

        Data rows are the non-comment, non-blank lines found after the
        headers have been resolved. Each whitespace-separated token is
        converted to int, else float, else kept as a string. Headers are
        applied only when their count equals the number of data columns.

        Returns a CSMFrame built from the data, with its `time` attribute set

        Raises NotImplementedError for the reports in `_REPORT_UNSUPPORTED`.
        Emits a UserWarning when no report keyword or no data row is found.
        """
        data_list = []  # one list of converted values per data row
        time = 0
        report_keyword = None
        column_headers = []
        has_headers = True       # False once we know no header line exists
        headers_found = False    # True once the header line was consumed
        # Countdown to the header line: 0 = not armed, 1 = this line is the
        # header line, 2 = skip this line, the next one is the header line.
        lines_until_header = 0

        for line in frame_data:
            if match := self._TIME_PATTERN.search(line):  # obtain frame time
                time = float(match.group('time'))

            if headers_found or not has_headers:
                row = self._parse_data_line(line)
                if row is not None:
                    data_list.append(row)
                continue

            if lines_until_header == 2:
                # extra comment line below `% report`: skip it entirely
                lines_until_header = 1
                continue

            if lines_until_header == 1:
                stripped = line.strip()
                if stripped.startswith('%'):
                    column_headers = stripped.lstrip('%').split()
                    headers_found = True
                else:
                    # No header line where one was expected: give up on
                    # headers and keep the line as data.
                    has_headers = False
                    row = self._parse_data_line(line)
                    if row is not None:
                        data_list.append(row)
                continue

            if line.startswith('% report'):
                tokens = line.split()
                if len(tokens) > 2:
                    report_keyword = tokens[2]
                if report_keyword in self._REPORT_UNSUPPORTED:
                    raise NotImplementedError(f"Report type {report_keyword} is not supported yet.")
                if report_keyword in self._REPORT_TYPE_III:
                    # file structure:
                    #  09  % report fiber:distribution
                    #  10  bin     1  2  3  4  5 ...
                    has_headers = False
                elif report_keyword in self._REPORT_TYPE_II:
                    # file structure:
                    #  09  % report fiber:position
                    #  10  % some comment
                    #  11  % class   posX   posY   posZ ...
                    lines_until_header = 2
                else:
                    # file structure:
                    #  09  % report bead:position
                    #  10  % class   posX   posY   posZ ...
                    lines_until_header = 1

        if report_keyword is None:
            warnings.warn("Could not find report keyword in frame data. The file may be malformed.", UserWarning)
        if len(data_list) == 0:
            warnings.warn("No data found in frame. The file may be empty or malformed.", UserWarning)

        # convert the collected rows into a CSMFrame object
        table = pd.DataFrame(data_list)
        if column_headers and len(table.columns) == len(column_headers):
            table.columns = column_headers

        frame = CSMFrame(table)
        # append csmframe attributes
        frame.time = time
        return frame

    @staticmethod
    def _parse_data_line(line):
        """Return the converted values of a data line, or None for comment/blank lines."""
        if line.startswith('%') or not line.strip():
            return None
        return [CSMParser._convert_value(token) for token in line.split()]

    @staticmethod
    def _convert_value(token):
        """Return `token` as int if possible, else float (incl. nan/inf), else unchanged."""
        for cast in (int, float):
            try:
                return cast(token)
            except ValueError:
                continue
        return token





[docs]
class CYMParser:
    """
    Deprecated
    A class for parsing the configuration file (configuration.cym)
    Returns a dictionary that can be used in a CSMSimulation
    """

    @staticmethod
    def parse_config(config_file):
        """
        Parses configuration file using regex to get params

        `config_file` is the path of the configuration file to read.

        Returns a dictionary with two kinds of entries:
          * scalar parameters (`motor_count`, `cell_radius`,
            `crosslinker_count`, `filament_count`) stored as floats;
          * object parameters (`plus_motor`, `binder`, `filament`), each a
            dictionary of the `name = value` attributes found between the
            object's declaration line and the next line starting with `}`.
            Values are floats when convertible, strings otherwise.
            Attributes of an object declared in several places are merged.

        An object block that is still open at end of file is closed there.
        """
        param_dictionary = {}
        # construct regex dictionaries
        # parameters in rx_dict are automatically added to the simulation parameters
        rx_dict = {
                'motor_count':re.compile(r'new (?P<motor_count>\d+) couple motor'),
                # allow a fractional radius; a bare \d+ would cut 7.5 down to 7
                'cell_radius':re.compile(r' +geometry = circle (?P<cell_radius>\d+(?:\.\d*)?)'),
                'crosslinker_count':re.compile(r'new (?P<crosslinker_count>\d+) couple crosslinker'),
                'filament_count': re.compile(r'new (?P<filament_count>\d+) fiber filament')
                }

        # object_dict is used to obtain parameters that belong to an object, like motors
        object_dict = {
                'plus_motor':re.compile(r'set hand plus_motor'),
                'binder':re.compile(r'set hand binder'),
                'filament' : re.compile(r'(set fiber filament|new \d+ fiber filament)')
                }

        # the object attributes end here
        end_pattern = re.compile(r'}')
        attribute_pattern = re.compile(r' +(?P<attr_name>\S+) = (?P<attr_val>\S+)')

        with open(config_file, 'r') as file:
            for line in file:
                # check if line matches main key
                for key, rx in rx_dict.items():
                    match = rx.match(line)
                    if match:
                        param_dictionary[key] = float(match.group(key))
                        break
                # get properties of objects, if objects are found
                for obj, rx in object_dict.items():
                    if not rx.match(line):
                        continue
                    # merge into any attributes already collected for this object
                    obj_dict = param_dictionary.setdefault(obj, {})
                    # Read the attributes of the object specification. Iterating
                    # the same file object continues right after `line` and
                    # stops at EOF, so an unterminated block cannot hang.
                    for body_line in file:
                        if end_pattern.match(body_line):
                            break
                        attribute = attribute_pattern.match(body_line)
                        if attribute is None:
                            continue
                        name = attribute.group('attr_name')
                        value = attribute.group('attr_val')
                        try:  # convert to float if possible
                            obj_dict[name] = float(value)
                        except ValueError:
                            obj_dict[name] = value
                    break

        return param_dictionary