Importing packages and libraries¶
In [ ]:
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import statsmodels
import plotly.io as pio
# import the custom functions for this project
import cytox.core as ctx
# Widen the notebook page elements so that wide tables fit on screen.
display(HTML(data="""
<style>
div#notebook-container { width: 95%; }
div#menubar-container { width: 65%; }
div#maintoolbar-container { width: 99%; }
</style>
"""))
# Lift pandas' truncation limits so every row and column is rendered.
# The exact registered option names are used: misspelled or partial names
# only resolve through pandas' pattern-matching fallback, which can break
# when new options with similar names are introduced.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
Importing and organizing data¶
In [ ]:
# Import the single-cell exports of both measurements.
# Forward slashes are used in the relative paths so that the notebook runs
# unchanged on Windows, macOS and Linux (backslash separators only work on
# Windows).

# 30 min incubation (measurement1).
file_path_30 = '../data/raw_data/measurement1/single_cell_data.txt'
# skiprows=9: for this file 9 leading lines are skipped instead of 8.
df_30min = pd.read_csv(file_path_30, sep='\t', skiprows=9)

# 16 h incubation (measurement2).
file_path_16 = '../data/raw_data/measurement2/single_cell_data.txt'
# skiprows=9: for this file 9 leading lines are skipped instead of 8.
df_16h = pd.read_csv(file_path_16, sep='\t', skiprows=9)

# Tag every row with the incubation time of the measurement it came from, so
# the two tables can still be told apart after they are combined.
df_30min["Incubation"] = "30 min"
df_16h["Incubation"] = "16 h"
In [4]:
# Preview the first rows of the 30 min table to check how the file was parsed.
df_30min.head()
Out[4]:
| Row | Column | Timepoint | Field | Object No | X | Y | Bounding Box | Position X [µm] | Position Y [µm] | Compound | Concentration | Cell Type | Cell Count | Nuclei - Nucleus Area [µm²] | Nuclei - Nucleus Roundness | Nuclei - Intensity Nucleus Alexa 488 Mean | Nuclei - Intensity Nucleus Alexa 488 Median | Nuclei - Intensity Nucleus Alexa 568 Mean | Nuclei - Intensity Nucleus Alexa 568 Median | Nuclei - Green nuclei | Nuclei - Red nuclei | Nuclei - Red and Green nuclei | Unnamed: 23 | Incubation | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2 | 2 | 1 | 1 | 1 | 299 | 13 | [281,0,329,27] | -68.20 | 153.28 | NaN | NaN | NaN | NaN | 82.9212 | 0.738890 | 558.311 | 535 | 715.520 | 601 | 1 | 1 | 1 | NaN | 30 min |
| 1 | 2 | 2 | 1 | 1 | 2 | 789 | 13 | [773,0,825,27] | 75.13 | 153.38 | NaN | NaN | NaN | NaN | 95.2496 | 0.774948 | 391.678 | 381 | 284.929 | 278 | 0 | 0 | 0 | NaN | 30 min |
| 2 | 2 | 2 | 1 | 1 | 3 | 864 | 15 | [841,0,880,30] | 93.03 | 152.63 | NaN | NaN | NaN | NaN | 81.4857 | 0.899973 | 423.839 | 404 | 307.135 | 292 | 0 | 0 | 0 | NaN | 30 min |
| 3 | 2 | 2 | 1 | 1 | 4 | 505 | 95 | [468,60,543,129] | -9.67 | 129.35 | NaN | NaN | NaN | NaN | 341.3960 | 0.979878 | 535.545 | 527 | 341.536 | 337 | 1 | 0 | 0 | NaN | 30 min |
| 4 | 2 | 2 | 1 | 1 | 5 | 840 | 89 | [826,61,855,114] | 87.14 | 131.58 | NaN | NaN | NaN | NaN | 86.8055 | 0.701301 | 442.963 | 427 | 318.589 | 309 | 0 | 0 | 0 | NaN | 30 min |
In [5]:
# Preview the first rows of the 16 h table to check how the file was parsed.
df_16h.head()
Out[5]:
| Row | Column | Timepoint | Field | Object No | X | Y | Bounding Box | Position X [µm] | Position Y [µm] | Compound | Concentration | Cell Type | Cell Count | Nuclei - Nucleus Area [µm²] | Nuclei - Nucleus Roundness | Nuclei - Intensity Nucleus Alexa 488 Mean | Nuclei - Intensity Nucleus Alexa 488 Median | Nuclei - Intensity Nucleus Alexa 568 Mean | Nuclei - Intensity Nucleus Alexa 568 Median | Nuclei - Green nuclei | Nuclei - Red nuclei | Nuclei - Red and Green nuclei | Unnamed: 23 | Incubation | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2 | 8 | 1 | 1 | 1 | 116 | 16 | [93,0,155,34] | -120.59 | 152.50 | NaN | NaN | NaN | NaN | 144.5330 | 0.818041 | 358.428 | 351 | 231.344 | 229 | 0 | 0 | 0 | NaN | 16 h |
| 1 | 2 | 8 | 1 | 1 | 2 | 268 | 22 | [246,0,309,44] | -76.33 | 150.84 | NaN | NaN | NaN | NaN | 190.9080 | 0.902280 | 481.173 | 470 | 315.513 | 300 | 0 | 0 | 0 | NaN | 16 h |
| 2 | 2 | 8 | 1 | 1 | 3 | 401 | 13 | [363,0,422,27] | -42.37 | 153.37 | NaN | NaN | NaN | NaN | 112.5180 | 0.749322 | 455.647 | 449 | 315.590 | 301 | 0 | 0 | 0 | NaN | 16 h |
| 3 | 2 | 8 | 1 | 1 | 4 | 549 | 14 | [519,0,560,31] | 0.76 | 151.97 | NaN | NaN | NaN | NaN | 72.0552 | 0.753797 | 484.098 | 476 | 332.686 | 329 | 0 | 0 | 0 | NaN | 16 h |
| 4 | 2 | 8 | 1 | 1 | 5 | 734 | 21 | [711,0,770,42] | 58.09 | 151.32 | NaN | NaN | NaN | NaN | 153.8250 | 0.880566 | 406.104 | 399 | 307.142 | 300 | 0 | 0 | 0 | NaN | 16 h |
In [ ]:
# Combine the two measurements into one dataframe; the "Incubation" column
# records which measurement each row belongs to.
df = pd.concat([df_30min, df_16h])
# Map the numbering of the rows to the letters shown on the plate.
ctx.map_num_to_letter(df, col='Row', inplace=True)
# Shift the index labels by one. The two source tables were concatenated
# without re-indexing, so the labels are not unique across measurements.
df.index = df.index + 1
# The plate column number must be a string before it can be joined to the
# row letter.
df['Column'] = df['Column'].astype(str)
# Build the well ID (e.g. "B2") with vectorised string concatenation instead
# of a row-wise apply, which is very slow on single-cell data.
# To put a separator between the parts, write e.g.
# df['Row'] + '_' + df['Column']; swap the operands to change the order.
df['Plate format'] = df['Row'] + df['Column']
# Keep only the columns needed downstream, in the desired order.
df = df[['Plate format',
         'Incubation',
         'Nuclei - Intensity Nucleus Alexa 488 Mean',
         'Nuclei - Intensity Nucleus Alexa 488 Median',
         'Nuclei - Intensity Nucleus Alexa 568 Mean',
         'Nuclei - Intensity Nucleus Alexa 568 Median',
         'Nuclei - Green nuclei',
         'Nuclei - Red nuclei',
         'Nuclei - Red and Green nuclei',
         ]]
Defining controls and samples¶
In [ ]:
"""Combining the wells of the same samples"""
# Work on an independent copy so the assignments below do not write into a
# slice of the previous dataframe.
df = df.copy()

# Sample labels follow the pattern
# NameOfCompound_Concentration_DurationOfIncubation.
# Each entry is ((first_row, last_row, first_column, last_column), label);
# the four bounds are passed unchanged to ctx.well_ID_generator.
# NOTE(review): labels are assigned by well ID only, irrespective of the
# "Incubation" column - confirm the two measurements never share a well.
generated_layout = [
    # -------------- 30 min compound incubation --------------
    (("B", "B", 2, 3), "Staurosporine_1uM_6h"),
    (("B", "B", 4, 5), "Staurosporine_0.3uM_6h"),
    (("B", "B", 6, 7), "Staurosporine_0.1uM_6h"),
    (("C", "C", 2, 3), "Saponin_9uM_6h"),
    (("C", "C", 4, 5), "Saponin_3uM_6h"),
    (("C", "C", 6, 7), "Saponin_1uM_6h"),
    (("D", "D", 2, 3), "Verteporfin_6uM_30min"),
    (("D", "D", 4, 5), "Verteporfin_2uM_30min"),
    (("D", "D", 6, 7), "Verteporfin_0.6uM_30min"),
    (("E", "E", 2, 3), "Digoxin_6uM_30min"),
    (("E", "E", 4, 5), "Digoxin_2uM_30min"),
    (("E", "E", 6, 7), "Digoxin_0.6uM_30min"),
    (("F", "F", 2, 3), "Saponin_9uM_30min"),
    (("F", "F", 4, 5), "Saponin_3uM_30min"),
    (("F", "F", 6, 7), "Saponin_1uM_30min"),
    (("G", "G", 2, 3), 'DMSO_100%_30min'),
    (("G", "G", 4, 5), 'DMSO_30%_30min'),
    (("G", "G", 6, 7), 'DMSO_10%_30min'),
    # -------------- 16 h compound incubation --------------
    # Staurosporine and Saponin only serve to prove that the experiment
    # works, so they are not repeated for the second incubation duration.
    (("D", "D", 8, 9), "Verteporfin_6uM_16h"),
    (("E", "E", 8, 9), "Verteporfin_2uM_16h"),
    (("F", "F", 8, 9), "Verteporfin_0.6uM_16h"),
    (("D", "D", 10, 11), "Digoxin_6uM_16h"),
    (("E", "E", 10, 11), "Digoxin_2uM_16h"),
    (("F", "F", 10, 11), "Digoxin_0.6uM_16h"),
]

# 16 h DMSO controls are given as explicit well lists instead of through
# ctx.well_ID_generator.
explicit_layout = [
    (["B8", "C8"], 'DMSO_100%_16h'),
    (["G8", "G9"], 'DMSO_30%_16h'),
    (["G10"], 'DMSO_10%_16h'),
]

# Label all rows whose well ID belongs to each group. Rows from wells that
# appear in neither table keep NaN in "Sample".
for bounds, label in generated_layout:
    wells = ctx.well_ID_generator(*bounds)
    df.loc[df['Plate format'].isin(wells), 'Sample'] = label

for wells, label in explicit_layout:
    df.loc[df['Plate format'].isin(wells), 'Sample'] = label
In [ ]:
"""Add also a hypothetical well,"G11", which is an empty well just for the
sake of figures, otherwise cannot draw the graphs 6 by 6."""
# Start the placeholder record with 0 in every existing column ...
placeholder_well = {column: 0 for column in df.columns}
# ... then overwrite the identifying fields of the hypothetical well "G11".
placeholder_well.update({
    "Plate format": "G11",
    "Sample": "DMSO_10%_16h",
    "Incubation": "16 h",
})
# Append the record as one extra row; ignore_index renumbers the index of
# the resulting dataframe from 0.
# NOTE: running this cell more than once appends the placeholder row again.
df = pd.concat([df, pd.DataFrame([placeholder_well])], ignore_index=True)
In [ ]:
# Rows that still have NaN in "Sample" come from wells that were measured
# but contained no sample; keep only the labeled rows.
df = df[df["Sample"].notna()]
# To inspect the rows with missing values in a column instead, use:
# df = df.loc[df["COL_NAME"].isnull()]
In [18]:
# Quick look at the first two rows of the labeled table.
df.head(2)
Out[18]:
| Plate format | Incubation | Nuclei - Intensity Nucleus Alexa 488 Mean | Nuclei - Intensity Nucleus Alexa 488 Median | Nuclei - Intensity Nucleus Alexa 568 Mean | Nuclei - Intensity Nucleus Alexa 568 Median | Nuclei - Green nuclei | Nuclei - Red nuclei | Nuclei - Red and Green nuclei | Sample | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | B2 | 30 min | 558.311 | 535 | 715.520 | 601 | 1 | 1 | 1 | Staurosporine_1uM_6h |
| 1 | B2 | 30 min | 391.678 | 381 | 284.929 | 278 | 0 | 0 | 0 | Staurosporine_1uM_6h |
Making separate dataframe¶
- So far, both experiments were processed in a single dataframe for convenience. Now we separate the data into
two dataframes, so that a threshold can be defined and the samples ordered for each experiment when generating the graphs.
In [19]:
# List the distinct incubation labels present before splitting the table.
df["Incubation"].unique()
Out[19]:
array(['30 min', '16 h'], dtype=object)
In [20]:
# Split the combined table back into one dataframe per incubation time.
df_30min = df[df["Incubation"].eq("30 min")]
df_16h = df[df["Incubation"].eq("16 h")]
In [ ]:
# Desired order of the samples on the later plots: DMSO first, then each
# compound from its lowest to its highest concentration.
samples_order_30min = [
    "DMSO_10%_30min", "DMSO_30%_30min", "DMSO_100%_30min",
    "Staurosporine_0.1uM_6h", "Staurosporine_0.3uM_6h", "Staurosporine_1uM_6h",
    "Saponin_1uM_6h", "Saponin_3uM_6h", "Saponin_9uM_6h",
    "Saponin_1uM_30min", "Saponin_3uM_30min", "Saponin_9uM_30min",
    "Verteporfin_0.6uM_30min", "Verteporfin_2uM_30min", "Verteporfin_6uM_30min",
    "Digoxin_0.6uM_30min", "Digoxin_2uM_30min", "Digoxin_6uM_30min",
]
samples_order_16h = [
    "DMSO_10%_16h", "DMSO_30%_16h", "DMSO_100%_16h",
    "Verteporfin_0.6uM_16h", "Verteporfin_2uM_16h", "Verteporfin_6uM_16h",
    "Digoxin_0.6uM_16h", "Digoxin_2uM_16h", "Digoxin_6uM_16h",
]

# Hand independent copies to the sorter so that the slices taken from the
# combined dataframe are never modified through a view.
# ctx.sample_order_sorter is a project helper; presumably it arranges the
# rows by the position of their "Sample" value in the given list - verify
# against its implementation.
df_30min = ctx.sample_order_sorter(df_30min.copy(), samples_order_30min, "Sample")
df_16h = ctx.sample_order_sorter(df_16h.copy(), samples_order_16h, "Sample")
Export the data to csv¶
In [ ]:
# Write one cleaned table per incubation time, without the index column.
# Forward slashes keep the relative paths valid on Windows, macOS and Linux
# (backslash separators only work on Windows).
df_30min.to_csv('../data/cleaned_data/df_30min.csv', index=False)
df_16h.to_csv('../data/cleaned_data/df_16h.csv', index=False)