Python script to convert from csv/xlsx to md/html #126

rdutta1999 · 2022-06-09T10:06:00Z

The idea was to convert the Freecam and Tool archive spreadsheet into a markdown file.

The code snippet:

import re
import os
import argparse
import pandas as pd

# File extensions (without the leading dot) accepted for the input file.
# Used by the command-line entry point for validation and help text.
input_formats_supported = ["csv", "xlsx"]
# File extensions (without the leading dot) accepted for the output file.
# Used by the command-line entry point for validation and help text.
output_formats_supported = ["md", "html"]

def read_file(file_name):
    """
    Read a CSV or Excel file and return its contents as a pandas DataFrame.

    The reader is chosen from the file extension, which is compared
    case-insensitively so that ``DATA.CSV`` is treated like ``data.csv``.

    Args:
        file_name: Path of the input file (``str`` or ``os.PathLike``),
            ending in ``.csv`` or ``.xlsx``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` or ``pd.read_excel``.

    Raises:
        FileNotFoundError: If ``file_name`` does not exist.
        ValueError: If the file extension is not supported.
    """
    if not os.path.exists(file_name):
        raise FileNotFoundError('Input File does not exist.')

    # Normalise once so the extension checks below ignore letter case.
    lowered_name = os.fspath(file_name).lower()

    if lowered_name.endswith('.csv'):
        return pd.read_csv(file_name)
    if lowered_name.endswith('.xlsx'):
        return pd.read_excel(file_name)

    raise ValueError('Input File type not supported')


def process_df(df):
    """
    Clean a DataFrame so that it can be rendered as a single-line-per-row table.

    This function was written to parse and clean Nico's Freecam-Tools
    Spreadsheet. Change it to parse and clean your own data.

    The DataFrame is modified in place: missing values are replaced with
    empty strings, and new-line characters inside string cells are replaced
    with a single space. Non-string values (numbers, booleans, ...) are left
    untouched.

    Args:
        df: The pandas DataFrame to clean.

    Returns:
        The same DataFrame object, after cleaning.
    """
    # Replace NaN values with empty strings
    df.fillna('', inplace=True)

    # Replace new-line characters in each string cell with a whitespace.
    for col in df.columns:
        column = df[col]

        # Only text-bearing columns can contain new-lines; the ``.str``
        # accessor would raise on purely numeric columns, so skip those.
        holds_text = (
            pd.api.types.is_object_dtype(column)
            or pd.api.types.is_string_dtype(column)
        )
        if not holds_text:
            continue

        # Object columns may mix strings with other values, so rewrite only
        # real ``str`` cells and pass everything else through unchanged.
        df[col] = column.map(
            lambda value: value.replace('\n', ' ') if isinstance(value, str) else value
        )

    return df


def process_markdown_string(string):
    """
    Compact a markdown table string.

    Two clean-ups are applied:

    * In delimiter rows (lines made up only of ``|``, ``:``, ``-`` and
      whitespace, containing at least one hyphen) every run of hyphens is
      collapsed to a single hyphen. Hyphens in any other line are left alone
      so that cell contents such as ``--flag`` are not altered.
    * Every run of whitespace other than new-line and carriage-return
      characters is collapsed to a single space.

    Args:
        string: The markdown text to clean.

    Returns:
        The cleaned markdown text.
    """
    delimiter_chars = frozenset("|:- \t\r")

    # Removing unnecessary hyphens used to create the headers. Splitting on
    # "\n" keeps any "\r" attached to its line, so line endings survive.
    cleaned_lines = []
    for line in string.split("\n"):
        if "-" in line and delimiter_chars.issuperset(line):
            line = re.sub(r"-+", "-", line)
        cleaned_lines.append(line)
    string = "\n".join(cleaned_lines)

    # Cleaning whitespaces except newline and carriage return
    string = re.sub(r"[^\S\r\n]+", " ", string)

    return string


def save_file(df, file_name):
    """
    Render the dataframe as markdown or HTML and write it to ``file_name``.

    The format is picked from the file name: ``.md`` produces a markdown
    table (cleaned by ``process_markdown_string``), ``.html`` produces an
    HTML table with centred headers. The rendered text is also printed.
    Any other extension raises an ``Exception``.
    """
    wants_markdown = file_name.endswith('.md')

    # Reject unknown extensions up front, before rendering anything.
    if not wants_markdown and not file_name.endswith('.html'):
        raise Exception('Output File type not supported')

    if wants_markdown:
        rendered = process_markdown_string(df.to_markdown(index=False))
    else:
        rendered = df.to_html(index=False, justify='center')

    print("The final String.... \n\n" + rendered)

    # Characters that cannot be encoded are written as XML character references.
    with open(file_name, "w", encoding="utf-8", errors="xmlcharrefreplace") as handle:
        handle.write(rendered)


def read_and_convert(input_path, output_path):
    """
    Run the whole conversion: load ``input_path``, clean the data, and
    write the result to ``output_path``.
    """
    cleaned = process_df(read_file(input_path))
    save_file(cleaned, output_path)


if __name__ == "__main__":
    # Comma-separated lists of accepted extensions, shared by the help
    # texts and the error messages below.
    input_formats_text = ", ".join(input_formats_supported)
    output_formats_text = ", ".join(output_formats_supported)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "input_file",
        help="Path of the input file to convert. Supported formats are: " + input_formats_text,
    )
    parser.add_argument(
        "output_file",
        help="Path of the output file. Supported formats are: " + output_formats_text,
    )

    args = parser.parse_args()

    input_path = args.input_file
    output_path = args.output_file

    # Everything after the last dot is treated as the extension.
    input_extension = input_path.rsplit(".", 1)[-1]
    output_extension = output_path.rsplit(".", 1)[-1]

    # Validate both extensions before doing any work.
    if input_extension not in input_formats_supported:
        raise Exception(
            "Input file format not supported. Only the following formats are supported: "
            + input_formats_text
        )

    if output_extension not in output_formats_supported:
        raise Exception(
            "Output file format not supported. Only the following formats are supported: "
            + output_formats_text
        )

    read_and_convert(input_path, output_path)

I didn't create a new PR since I wasn't sure where (or even if) the code should be placed in the repo.

The text was updated successfully, but these errors were encountered:

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Python script to convert from csv/xlsx to md/html #126

Python script to convert from csv/xlsx to md/html #126

rdutta1999 commented Jun 9, 2022

Python script to convert from csv/xlsx to md/html #126

Python script to convert from csv/xlsx to md/html #126

Comments

rdutta1999 commented Jun 9, 2022