Importing packages and libraries¶

In [ ]:
from pathlib import Path

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import seaborn as sns
import statsmodels
from IPython.display import display, HTML
from plotly.subplots import make_subplots

# import the custom functions for this project
import cytox.core as ctx

# Widen the notebook containers so that wide dataframes fit on the screen.
notebook_width_css = """
<style>
    div#notebook-container    { width: 95%; }
    div#menubar-container     { width: 65%; }
    div#maintoolbar-container { width: 99%; }
</style>
"""
display(HTML(data=notebook_width_css))

# Lift the pandas limits on how many rows and columns are rendered.
for option_name in ('display.max_rows', 'display.max_Columns'):
    pd.set_option(option_name, None)

Importing and organizing data¶

In [ ]:
# Load the single-cell export of each measurement and tag every row with the
# incubation time of the measurement it belongs to.
# The paths are built with pathlib so they resolve on every operating system,
# not only where "\" is the path separator.
raw_data_dir = Path("..") / "data" / "raw_data"

# Number of leading lines skipped in both export files (9 instead of 8, as
# noted by the original author).
N_SKIPPED_LINES = 9

# Importing data (30 min incubation)
file_path_30 = raw_data_dir / "measurement1" / "single_cell_data.txt"
df_30min = pd.read_csv(file_path_30, sep='\t', skiprows=N_SKIPPED_LINES)

# Importing data (16 h incubation)
file_path_16 = raw_data_dir / "measurement2" / "single_cell_data.txt"
df_16h = pd.read_csv(file_path_16, sep='\t', skiprows=N_SKIPPED_LINES)

# Label the origin of every row so the measurements can be told apart after
# they are combined into one dataframe.
df_30min["Incubation"] = "30 min"
df_16h["Incubation"] = "16 h"
In [4]:
# Preview the first five rows of the 30 min measurement.
df_30min.head(n=5)
Out[4]:
Row Column Timepoint Field Object No X Y Bounding Box Position X [µm] Position Y [µm] Compound Concentration Cell Type Cell Count Nuclei - Nucleus Area [µm²] Nuclei - Nucleus Roundness Nuclei - Intensity Nucleus Alexa 488 Mean Nuclei - Intensity Nucleus Alexa 488 Median Nuclei - Intensity Nucleus Alexa 568 Mean Nuclei - Intensity Nucleus Alexa 568 Median Nuclei - Green nuclei Nuclei - Red nuclei Nuclei - Red and Green nuclei Unnamed: 23 Incubation
0 2 2 1 1 1 299 13 [281,0,329,27] -68.20 153.28 NaN NaN NaN NaN 82.9212 0.738890 558.311 535 715.520 601 1 1 1 NaN 30 min
1 2 2 1 1 2 789 13 [773,0,825,27] 75.13 153.38 NaN NaN NaN NaN 95.2496 0.774948 391.678 381 284.929 278 0 0 0 NaN 30 min
2 2 2 1 1 3 864 15 [841,0,880,30] 93.03 152.63 NaN NaN NaN NaN 81.4857 0.899973 423.839 404 307.135 292 0 0 0 NaN 30 min
3 2 2 1 1 4 505 95 [468,60,543,129] -9.67 129.35 NaN NaN NaN NaN 341.3960 0.979878 535.545 527 341.536 337 1 0 0 NaN 30 min
4 2 2 1 1 5 840 89 [826,61,855,114] 87.14 131.58 NaN NaN NaN NaN 86.8055 0.701301 442.963 427 318.589 309 0 0 0 NaN 30 min
In [5]:
# Preview the first five rows of the 16 h measurement.
df_16h.head(n=5)
Out[5]:
Row Column Timepoint Field Object No X Y Bounding Box Position X [µm] Position Y [µm] Compound Concentration Cell Type Cell Count Nuclei - Nucleus Area [µm²] Nuclei - Nucleus Roundness Nuclei - Intensity Nucleus Alexa 488 Mean Nuclei - Intensity Nucleus Alexa 488 Median Nuclei - Intensity Nucleus Alexa 568 Mean Nuclei - Intensity Nucleus Alexa 568 Median Nuclei - Green nuclei Nuclei - Red nuclei Nuclei - Red and Green nuclei Unnamed: 23 Incubation
0 2 8 1 1 1 116 16 [93,0,155,34] -120.59 152.50 NaN NaN NaN NaN 144.5330 0.818041 358.428 351 231.344 229 0 0 0 NaN 16 h
1 2 8 1 1 2 268 22 [246,0,309,44] -76.33 150.84 NaN NaN NaN NaN 190.9080 0.902280 481.173 470 315.513 300 0 0 0 NaN 16 h
2 2 8 1 1 3 401 13 [363,0,422,27] -42.37 153.37 NaN NaN NaN NaN 112.5180 0.749322 455.647 449 315.590 301 0 0 0 NaN 16 h
3 2 8 1 1 4 549 14 [519,0,560,31] 0.76 151.97 NaN NaN NaN NaN 72.0552 0.753797 484.098 476 332.686 329 0 0 0 NaN 16 h
4 2 8 1 1 5 734 21 [711,0,770,42] 58.09 151.32 NaN NaN NaN NaN 153.8250 0.880566 406.104 399 307.142 300 0 0 0 NaN 16 h
In [ ]:
# Combine both measurements into one dataframe, build the well ID of every
# row and keep only the columns needed for the analysis.
# ignore_index=True gives every row its own label; without it the labels of
# the two source dataframes would repeat after the concatenation.
df = pd.concat([df_30min, df_16h], ignore_index=True)

# Map the numbering of the rows to actual alphabet as seen on the plate.
ctx.map_num_to_letter(df, col='Row', inplace=True)

# Shift the row labels by one.
df.index = df.index + 1

# Convert int to str: It is necessary to convert integer to string before
# combining several columns into one column.
df['Column'] = df['Column'].astype(str)

# Combine the row letter and the column number into the well ID (e.g. "B2").
# Vectorized string concatenation replaces a row-by-row apply, which is slow
# on single-cell data. To separate the two parts, insert the desired
# character between them (e.g. df['Row'] + '_' + df['Column']); the order of
# the operands defines the format.
df['Plate format'] = df['Row'].astype(str) + df['Column']

# Change the order of the columns as desired and remove the unwanted columns.
columns_to_keep = [
    'Plate format',
    'Incubation',
    'Nuclei - Intensity Nucleus Alexa 488 Mean',
    'Nuclei - Intensity Nucleus Alexa 488 Median',
    'Nuclei - Intensity Nucleus Alexa 568 Mean',
    'Nuclei - Intensity Nucleus Alexa 568 Median',
    'Nuclei - Green nuclei',
    'Nuclei - Red nuclei',
    'Nuclei - Red and Green nuclei',
]
df = df[columns_to_keep]

Defining controls and samples¶

In [ ]:
"""Combining the wells of the same samples"""
# Label every row with the sample of its well. The plate layout is kept in
# one table (wells -> sample name) and applied in a single step instead of
# one hand-written assignment per sample.
df = df.copy()

# The naming of the sample is as
# NameOfCompound_Concentration_DurationOfIncubation.
plate_layout = [
    #--------------30 min compound incubation----------------------------
    (ctx.well_ID_generator("B", "B", 2, 3), "Staurosporine_1uM_6h"),
    (ctx.well_ID_generator("B", "B", 4, 5), "Staurosporine_0.3uM_6h"),
    (ctx.well_ID_generator("B", "B", 6, 7), "Staurosporine_0.1uM_6h"),

    (ctx.well_ID_generator("C", "C", 2, 3), "Saponin_9uM_6h"),
    (ctx.well_ID_generator("C", "C", 4, 5), "Saponin_3uM_6h"),
    (ctx.well_ID_generator("C", "C", 6, 7), "Saponin_1uM_6h"),

    (ctx.well_ID_generator("D", "D", 2, 3), "Verteporfin_6uM_30min"),
    (ctx.well_ID_generator("D", "D", 4, 5), "Verteporfin_2uM_30min"),
    (ctx.well_ID_generator("D", "D", 6, 7), "Verteporfin_0.6uM_30min"),

    (ctx.well_ID_generator("E", "E", 2, 3), "Digoxin_6uM_30min"),
    (ctx.well_ID_generator("E", "E", 4, 5), "Digoxin_2uM_30min"),
    (ctx.well_ID_generator("E", "E", 6, 7), "Digoxin_0.6uM_30min"),

    (ctx.well_ID_generator("F", "F", 2, 3), "Saponin_9uM_30min"),
    (ctx.well_ID_generator("F", "F", 4, 5), "Saponin_3uM_30min"),
    (ctx.well_ID_generator("F", "F", 6, 7), "Saponin_1uM_30min"),

    (ctx.well_ID_generator("G", "G", 2, 3), 'DMSO_100%_30min'),
    (ctx.well_ID_generator("G", "G", 4, 5), 'DMSO_30%_30min'),
    (ctx.well_ID_generator("G", "G", 6, 7), 'DMSO_10%_30min'),

    #--------------16h compound incubation----------------------------
    # We only need Staurosporin and Saponin for proving the experiment works
    # and there is no need to include in both experiments with different
    # duration of incubation.
    (ctx.well_ID_generator("D", "D", 8, 9), "Verteporfin_6uM_16h"),
    (ctx.well_ID_generator("E", "E", 8, 9), "Verteporfin_2uM_16h"),
    (ctx.well_ID_generator("F", "F", 8, 9), "Verteporfin_0.6uM_16h"),

    (ctx.well_ID_generator("D", "D", 10, 11), "Digoxin_6uM_16h"),
    (ctx.well_ID_generator("E", "E", 10, 11), "Digoxin_2uM_16h"),
    (ctx.well_ID_generator("F", "F", 10, 11), "Digoxin_0.6uM_16h"),

    (["B8", "C8"], 'DMSO_100%_16h'),
    (["G8", "G9"], 'DMSO_30%_16h'),
    (["G10"], 'DMSO_10%_16h'),
]

# Flatten the layout into a well -> sample lookup. A well that is listed for
# two samples is a layout mistake, so it is reported instead of letting the
# later entry silently overwrite the earlier one.
# NOTE(review): assumes ctx.well_ID_generator returns an iterable of well-ID
# strings such as "B2" - confirm against cytox.core.
well_to_sample = {}
for wells, sample in plate_layout:
    for well in wells:
        if well in well_to_sample:
            raise ValueError(
                f"Well {well} is assigned to both "
                f"{well_to_sample[well]!r} and {sample!r}"
            )
        well_to_sample[well] = sample

# Wells that are not part of the layout get NaN in 'Sample'.
df['Sample'] = df['Plate format'].map(well_to_sample)
In [ ]:
"""Add also a hypothetical well,"G11", which is an empty well just for the 
sake of figures, otherwise cannot draw the graphs 6 by 6."""
# Values that identify the hypothetical well.
well_identity = {
    "Plate format": "G11",
    "Sample": "DMSO_10%_16h",
    "Incubation": "16 h",
}

# Every other column of the dataframe is set to 0 for this row.
new_row = {
    **well_identity,
    **{col: 0 for col in df.columns if col not in well_identity},
}

# Turn the row into a one-row dataframe and append it; ignore_index
# renumbers the row labels of the result.
new_row_df = pd.DataFrame([new_row])
df = pd.concat([df, new_row_df], ignore_index=True)
In [ ]:
# Wells that were measured but held no sample have NaN in 'Sample'; keep only
# the rows that carry a sample label.
has_sample = df["Sample"].notna()
df = df.loc[has_sample]
# To inspect the rows with a missing value in a column instead, use:
# df.loc[df["COL_NAME"].isnull()]
In [18]:
# Preview the first two rows of the labeled dataframe.
df.head(n=2)
Out[18]:
Plate format Incubation Nuclei - Intensity Nucleus Alexa 488 Mean Nuclei - Intensity Nucleus Alexa 488 Median Nuclei - Intensity Nucleus Alexa 568 Mean Nuclei - Intensity Nucleus Alexa 568 Median Nuclei - Green nuclei Nuclei - Red nuclei Nuclei - Red and Green nuclei Sample
0 B2 30 min 558.311 535 715.520 601 1 1 1 Staurosporine_1uM_6h
1 B2 30 min 391.678 381 284.929 278 0 0 0 Staurosporine_1uM_6h

Making separate dataframe¶

  • First, both experiments were processed in a single dataframe for convenience. Now we separate the data into two dataframes, so that a threshold can be defined and the samples can be ordered for generating graphs.

In [19]:
# List the distinct incubation labels present in the dataframe.
df.loc[:, "Incubation"].unique()
Out[19]:
array(['30 min', '16 h'], dtype=object)
In [20]:
# Split the combined dataframe into one dataframe per incubation time.
incubation = df["Incubation"]
df_30min = df[incubation == "30 min"]
df_16h = df[incubation == "16 h"]
In [ ]:
# Order in which the samples should appear on the plots later; each
# dataframe is passed to ctx.sample_order_sorter together with its list.
samples_order_30min = [
    "DMSO_10%_30min",
    "DMSO_30%_30min",
    "DMSO_100%_30min",
    "Staurosporine_0.1uM_6h",
    "Staurosporine_0.3uM_6h",
    "Staurosporine_1uM_6h",
    "Saponin_1uM_6h",
    "Saponin_3uM_6h",
    "Saponin_9uM_6h",
    "Saponin_1uM_30min",
    "Saponin_3uM_30min",
    "Saponin_9uM_30min",
    "Verteporfin_0.6uM_30min",
    "Verteporfin_2uM_30min",
    "Verteporfin_6uM_30min",
    "Digoxin_0.6uM_30min",
    "Digoxin_2uM_30min",
    "Digoxin_6uM_30min",
]

samples_order_16h = [
    "DMSO_10%_16h",
    "DMSO_30%_16h",
    "DMSO_100%_16h",
    "Verteporfin_0.6uM_16h",
    "Verteporfin_2uM_16h",
    "Verteporfin_6uM_16h",
    "Digoxin_0.6uM_16h",
    "Digoxin_2uM_16h",
    "Digoxin_6uM_16h",
]

# Hand the sorter an independent copy of each dataframe.
df_30min = ctx.sample_order_sorter(
    df_30min.copy(), samples_order_30min, "Sample"
)
df_16h = ctx.sample_order_sorter(
    df_16h.copy(), samples_order_16h, "Sample"
)

Export the data to csv¶

In [ ]:
# Write the cleaned dataframe of each incubation time to a CSV file, without
# the row labels.
# The path is built with pathlib so it resolves on every operating system,
# and the output folder is created first so the export does not fail when it
# does not exist yet.
cleaned_data_dir = Path("..") / "data" / "cleaned_data"
cleaned_data_dir.mkdir(parents=True, exist_ok=True)

df_30min.to_csv(cleaned_data_dir / "df_30min.csv", index=False)
df_16h.to_csv(cleaned_data_dir / "df_16h.csv", index=False)