Initial commit

e72ff611 · SofiaTorchia · e72ff611 · e72ff611 · e72ff611 · e72ff611
Commit e72ff611 authored 9 months ago by SofiaTorchia
--- a/LICENSE
+++ b/LICENSE
+MIT License
+
+Copyright (c) 2023 Dominik Kutra, Christian Tischer, Matteo Spatuzzi, Jean-Karim Heriche;
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/hiding_analysis.py
+++ b/hiding_analysis.py
+""" 
+
+Module for generating a dataset where for each mouse we compute 
+details about the time spent in certain ROIs for hiding quantification
+    
+Input data structure:
+
+|   |video         |animal |roi      |entry|exit|
+|-  |-             |-      |-        |-    |-   |
+|0  |C57j-B6-US_MS |UM     |Up-corn1 |1214 |1477|
+|1  |C57j-B6-US_MS |UM     |Up-corn1 |1690 |2500|
+|2  |C57j-B6-US_MS |UM     |Up-corn1 |2570 |2598|
+|3  |C57j-B6-US_MS |UM     |Up-corn1 |3688 |3737|
+|4  |C57j-B6-US_MS |UM     |Up-corn1 |7210 |7301|
+|.. |..            |..     |..       |..   |..  |    
+    
+    
+"""
+
+import numpy as np
+import pandas as pd
+
+
+FRAMERATE = 45
+N = 27000    # Total number of frames in a 10-minute recording
+
+
+def delete_hiding_noise(data: pd.DataFrame) -> pd.DataFrame:
+    '''
+        Removes spurious, very short ROI visits from the dataset.
+
+        Sometimes, a mouse moving very close to an ROI is incorrectly detected as being
+        inside this ROI for a few frames. This function discards the rows where
+        the visit lasts FRAMERATE frames or fewer (i.e. at most one second),
+        aiming to reduce noise in the dataset.
+
+        The input dataframe is not modified: a filtered copy is returned.
+
+        Args: 
+            data: dataset containing ROI information for each mouse along with the 
+                  corresponding entry and exit time frames.
+        Returns: 
+            denoised version of the input dataset. Rows are renumbered from 0 and
+            the previous index is kept in an 'index' column.
+    '''
+    too_short = (data.exit - data.entry) <= FRAMERATE
+    # Boolean selection builds a new frame, so the caller's data stays intact
+    return data[~too_short].reset_index()
+
+
+def hiding_and_unreachable_vectors(data: pd.DataFrame) -> tuple[np.ndarray, np.ndarray]:
+    '''
+        This function computes: 
+         - a vector containing 1 for each frame when the mouse is hiding, i.e. 
+         when it is in a corner (roi name contains 'corn') or in a
+         high-position ROI (roi name contains 'H').
+         - a vector containing 1 for each frame when the mouse is unreachable, 
+         i.e. it is in any high-position ROI (H-..).
+
+         A visit covers the frames entry, entry + 1, ..., exit - 1.
+         Both vectors have length N*11. Frames beyond that length are ignored
+         by the slice assignment.
+         
+         Args: 
+             data: dataset containing ROI information for one mouse along with the 
+                  corresponding integer entry and exit time frames.
+        Returns: 
+            hiding: binary data with 1 for frames when the mouse is hiding
+            unreachable: binary data with 1 for frames when the mouse is
+                        unreachable by the other mouse
+            
+    '''
+    hiding = np.zeros(N*11)
+    unreachable = np.zeros(N*11)
+
+    for roi, entry, exit_frame in zip(data.roi, data.entry, data.exit):
+        is_high = 'H' in roi
+        if is_high or 'corn' in roi:
+            # One slice assignment per visit instead of a frame-by-frame loop
+            hiding[entry:exit_frame] = 1
+            if is_high:
+                unreachable[entry:exit_frame] = 1
+    return hiding, unreachable
+
+
+
+
+
+
+def _roi_time(visits: pd.DataFrame, rois: list[str]):
+    '''
+        Returns the total number of frames spent in the given ROIs, i.e. the
+        sum of (exit - entry) over the rows of `visits` whose roi is in `rois`.
+    '''
+    selected = visits[visits.roi.isin(rois)]
+    return (selected['exit'] - selected['entry']).sum()
+
+
+def time_in_rois(paths: list[str]) -> list[list]:
+    '''
+        Here, for each input file, we first apply 'delete_hiding_noise' to filter out
+        the noise related to hiding time.
+        Then, for each mouse ('UM' and 'LM') of each recording, we compute the
+        total recording time as the time spent in 'Up-comp' plus 'Down-comp',
+        and express the following quantities as a portion of it
+        (rounded to 2 decimals):
+        - time spent in the other compartment
+        - time spent in the other nest minus time spent in the other HF
+        - time spent hiding (HF, HT and corner ROIs of both compartments)
+        - time spent hiding in the other compartment
+
+        The "other" compartment is 'Down' for 'UM' and 'Up' for 'LM'.
+        
+        Args:
+            paths: paths of the input csv files
+        Returns: 
+            vector_list: list of vectors, one per mouse and recording:
+                        ['<video>-<mouse>', portion_other_comp,
+                         portion_other_nest_minus_hf, portion_hidden,
+                         portion_hidden_other_comp]
+    '''
+    hiding_spots = ['HF', 'HT', 'corn1', 'corn2', 'corn3', 'corn4']
+    all_hiding_rois = [side + '-' + spot
+                       for side in ('Up', 'Down') for spot in hiding_spots]
+
+    vector_list = []
+
+    for path in paths:
+        data = delete_hiding_noise(pd.read_csv(path))
+
+        for file_name in data.video.unique():
+            for mouse in ['UM','LM']:
+                other_comp = 'Down' if mouse == 'UM' else 'Up'
+                visits = data[(data.video == file_name) & (data.animal == mouse)]
+
+                total_time = _roi_time(visits, ['Up-comp', 'Down-comp'])
+                time_hidden = _roi_time(visits, all_hiding_rois)
+                time_other_comp = _roi_time(visits, [other_comp + '-comp'])
+                time_other_nest = _roi_time(visits, [other_comp + '-nest'])
+                time_other_hf = _roi_time(visits, [other_comp + '-HF'])
+                time_hidden_other_comp = _roi_time(
+                    visits, [other_comp + '-' + spot for spot in hiding_spots])
+
+                vector_list.append([
+                    file_name + '-' + mouse,
+                    round(time_other_comp / total_time, 2),
+                    round((time_other_nest - time_other_hf) / total_time, 2),
+                    round(time_hidden / total_time, 2),
+                    round(time_hidden_other_comp / total_time, 2),
+                ])
+
+    return vector_list
+
+
+
+def traveled_distance(paths: list[str]) -> list[list]:
+    '''    
+        Here, we extract the information about the distance traveled by each mouse
+
+        For every recording in every input file, and for each mouse
+        ('UM' and 'LM'), the first row whose measure is 'Distance (cm)' is read.
+        An IndexError is raised when a mouse has no such row.
+        
+        Args:
+            paths: paths of the input csv files (columns: video, animal,
+                   measure, value)
+        Returns: 
+            vectors: list with one entry per mouse and recording, each
+                     containing the '<video>-<mouse>' name
+                     and the total distance traveled by the mouse 
+                     during the recording.
+    '''
+
+    vectors = []
+
+    for path in paths:
+        data = pd.read_csv(path)
+        is_distance = data.measure == 'Distance (cm)'
+
+        for file_name in data.video.unique():
+            for mouse in ['UM','LM']:
+                # A single combined mask avoids chained boolean indexing
+                selected = is_distance & (data.video == file_name) & (data.animal == mouse)
+                distance = data.loc[selected, 'value'].iloc[0]
+                # Append inside the mouse loop so that both mice are reported
+                vectors.append([file_name+'-'+mouse, distance])
+    return vectors
+
+
+
+def cumulative_time_in_rois(paths: str) -> pd.DataFrame:
+
+    '''
+        In the following snippet, for each input file, we first apply
+        'delete_hiding_noise' to filter out the noise related to hiding time.
+        Then, for each mouse ('UM' and 'LM') belonging to the same recording,
+        we build a per-frame dataframe (N*11 rows) containing:
+        - the binary hiding and unreachable vectors
+        - the cumulative time spent hiding
+        - the cumulative time spent in unreachable areas
+        - both cumulative sums divided by the total recording time
+          (time spent in 'Up-comp' plus 'Down-comp')
+
+        For recordings whose name contains '-T1.2', it identifies a restart
+        frame (the largest exit frame of the mouse), drops the frames between
+        it and N*9, and subtracts the values reached at the restart frame from
+        the cumulative sums of the remaining later frames.
+        NOTE(review): the previous documentation called these "T2 videos" while
+        the code tests for '-T1.2' - confirm which naming is intended.
+
+        NOTE(review): `paths` is iterated, so it is expected to be a collection
+        of paths rather than a single string. The dataframe is rebuilt on every
+        iteration and only the one of the last mouse of the last recording is
+        returned; with an empty `paths` the return statement fails because
+        `dataframe` is never assigned.
+        
+        Args:
+            paths: input data paths
+        Returns: 
+            dataframe: Each row of the dataframe contains the time
+                        measurements of one frame for a single mouse
+    '''
+
+    for path in paths:
+        data = pd.read_csv(path)
+        data = delete_hiding_noise(data)
+        names = data.video.unique()
+
+        for file_name in names:
+            for mouse in ['UM','LM']:
+
+                # Visits of this mouse in this recording
+                d = data[data.video == file_name]
+                d = d[d.animal == mouse]
+
+                hiding, unreachable = hiding_and_unreachable_vectors(d)
+                cumsum1 = np.cumsum(hiding)
+                cumsum2 = np.cumsum(unreachable)
+
+                # One row per frame; the row label equals the frame number
+                time_in_seconds = [round(i/FRAMERATE,4) for i in range(N*11)]
+                dataframe = pd.DataFrame({'frames': list(range(N*11)),
+                                          'time_in_seconds': time_in_seconds,'hiding': hiding, 
+                                          'unreachable': unreachable, 'hiding_cumsum': cumsum1, 
+                                          'unreachable_cumsum': cumsum2})           
+                # Total recording time = time spent in either compartment
+                total_time = (d[(d.roi == 'Up-comp') | (d.roi == 'Down-comp')]['exit'] -
+                              d[(d.roi == 'Up-comp') | (d.roi == 'Down-comp')]['entry']).sum()
+
+                cumsum1_portion = dataframe.hiding_cumsum / total_time
+                cumsum2_portion = dataframe.unreachable_cumsum / total_time
+
+                dataframe['relative_hiding_cumsum'] = cumsum1_portion
+                dataframe['relative_unreachable_cumsum'] = cumsum2_portion
+
+                if '-T1.2' in file_name:
+                    # Same selection as `d` above
+                    d1 = data[data.video == file_name]
+                    d1 = d1[d1.animal == mouse]
+                    restart_frame = d1.exit.max()
+
+                    # Values of the four cumulative columns at the restart frame
+                    d_hiding_cumsum = dataframe.hiding_cumsum[dataframe.frames == restart_frame]
+                    restart_h_cumsum = d_hiding_cumsum[restart_frame]
+
+                    condition = dataframe.frames == restart_frame
+                    d_unreachable_cumsum = dataframe.unreachable_cumsum[condition]
+                    restart_unreachable_cumsum = d_unreachable_cumsum[restart_frame]
+
+                    condition = dataframe.frames == restart_frame
+                    d_restart_hiding = dataframe.relative_hiding_cumsum[condition]
+                    restart_relative_hidden_cumsum = d_restart_hiding[restart_frame]
+
+                    condition = dataframe.frames == restart_frame
+                    d_restart_unreachable = dataframe.relative_unreachable_cumsum[condition]
+                    restart_relative_unreachable_cumsum = d_restart_unreachable[restart_frame]
+
+                    # Remove the rows labelled restart_frame + 1 .. N*9 - 1
+                    dataframe.drop(list(range(restart_frame + 1,N*9,1)), inplace = True)
+
+                    # After the drop, positions restart_frame + 1 onwards hold the
+                    # rows that were labelled N*9 and later (assuming
+                    # restart_frame < N*9 - TODO confirm).
+                    # NOTE(review): the assignments below go through
+                    # `dataframe.<column>.iloc[...]`, i.e. chained indexing; this
+                    # may not write back to `dataframe` when pandas copy-on-write
+                    # is active - verify against the pandas version in use.
+                    d_h_subset = dataframe.hiding_cumsum.iloc[restart_frame + 1 :]
+                    dataframe.hiding_cumsum.iloc[restart_frame + 1 :] = d_h_subset.apply(lambda x: x - restart_h_cumsum)
+
+                    d_subset = dataframe.unreachable_cumsum.iloc[restart_frame + 1 :]
+                    dataframe.unreachable_cumsum.iloc[restart_frame + 1 :] = d_subset.apply(lambda x: x - restart_unreachable_cumsum)
+
+                    d_subset = dataframe.relative_hiding_cumsum.iloc[restart_frame + 1 :]
+                    dataframe.relative_hiding_cumsum.iloc[restart_frame + 1 :] = d_subset.apply(lambda x: x - restart_relative_hidden_cumsum)
+
+                    d_subset = dataframe.relative_unreachable_cumsum.iloc[restart_frame + 1 :]
+                    dataframe.relative_unreachable_cumsum.iloc[restart_frame + 1 :] = d_subset.apply(lambda x: x - restart_relative_unreachable_cumsum)
+    return dataframe
--- a/pca_analysis.py
+++ b/pca_analysis.py
+"""
+Module for exploring the following dataset ('pca_dataset.csv')
+via PCA and for plotting the 2-dimensional data
+resulting from the projection onto the space generated
+by the first two principal components:
+
+|  | video          |chases|attacks|other_comp|other_cage|locomotion|ur_postures|flights|hiding|mouse|line |time|interaction|
+|- |-               | -    |-      |-         |-         |-         |-          |-      |-     |-    |-    |-   |-          |
+|0 |CD1-B1-UN_LS-T1 |0.0   |25.0   |9.5       |0.0       |15293.8694|6.0        |10.0   |21.5  |UN   |CD1  |T1  |1          |
+|1 |CD1-B1-UN_LS-T1 |15.0  |32.0   |50.5      |12.0      |12501.2516|0.0        |1.0    |12.5  |LS   |CD1  |T1  |1          |
+|2 |CD1-B1-MS_US-T1 |2.0   |25.0   |41.5      |2.0       |15990.6214|0.0        |0.0    |23.0  |MS   |CD1  |T1  |1          |
+|3 |CD1-B1-MS_US-T1 |6.0   |23.0   |32.0      |1.0       |13421.5625|0.0        |5.0    |22.5  |US   |CD1  |T1  |1          |
+|4 |CD1-B2-UN_LS-T1 |6.0   |38.0   |27.5      |5.5       |15003.1560|4.0        |9.0    |46.5  |UN   |CD1  |T1  |1          |
+|..|...             |...   |...    |...       |...       |...       |...        |...    |...   |...  |...  |... |...        |
+
+"""
+
+import pandas as pd
+import numpy as np
+from sklearn.decomposition import PCA
+import matplotlib.pyplot as plt
+from sklearn.preprocessing import StandardScaler
+
+PATH = 'pca_dataset.csv'
+STRAINS = ['CD1','C57j','Hyb']
+
+
+def create_subset_for_pca(line: str, time: str, interaction: int = None) -> pd.DataFrame:
+    """
+        Creates a subset of the entire dataset on which PCA will be performed.
+        The dataset is read from PATH. Data are selected based on the mouse
+        strain "line" and the recording period "time".
+        
+        Args: 
+            line: condition to select a specific strain of mice. Any value
+                  not listed in STRAINS (e.g. 'all animals') keeps all strains.
+            time: recording time to filter out early/late data
+            interaction: if not None, 1 for selecting only interacting mice, 
+                         0 for non-interacting mice.
+        Returns: 
+            dataset: subset of initial dataset
+            
+    """
+    data = pd.read_csv(PATH)
+    dataset = data[data.time == time]
+
+    # Compare against None explicitly: interaction=0 is a valid selection
+    if interaction is not None:
+        # Build the mask from the already filtered frame so that indexes match
+        dataset = dataset[dataset.interaction == interaction]
+
+    if line in STRAINS:
+        dataset = dataset[dataset.line == line]
+    return dataset
+
+
+
+def plot_projected_data(new_data: pd.DataFrame, status_column: str=None) -> None:
+    """
+        Plots data points projected into the space defined by the two principal components 
+        and colors points based on their status (dominant or subordinate mouse),
+        on status and interaction, or on mouse line.
+        
+        Args: 
+            new_data: input 2-dimensional dataset with mice as data points and 'pca0', 'pca1' 
+                      and the column named by status_column as columns.
+            status_column: dataset column to be chosen as color criterion. One of
+                      'status_and_interaction', 'line' or 'status'.
+        Returns: 
+            None. A color-coded scatter plot of the two dimensional data points
+            is drawn on the current matplotlib figure.
+        Raises:
+            ValueError: if status_column is not one of the supported columns.
+    """
+    # For each supported column: (fill colors, edge colors) keyed by category
+    palettes = {
+        'status_and_interaction': (
+            {'dominant': 'purple','subordinate':'orange','no interaction':'yellow'},
+            {'dominant': 'indigo','subordinate':'darkorange','no interaction':'gold'}),
+        'line': (
+            {'CD1': 'gold','Hyb':'firebrick','C57j':'deepskyblue'},
+            {'CD1': 'goldenrod','Hyb':'maroon','C57j':'dodgerblue'}),
+        'status': (
+            {'dominant': 'purple','subordinate':'orange'},
+            {'dominant': 'indigo','subordinate':'darkorange'}),
+    }
+    if status_column not in palettes:
+        raise ValueError(f'status_column must be one of {list(palettes)}, '
+                         f'got {status_column!r}')
+    colors, edgecolors = palettes[status_column]
+
+    for key, color in colors.items():
+        data_plot = new_data[new_data[status_column] == key]
+        plt.scatter(data_plot['pca0'],data_plot['pca1'],color = color,
+                    s = 200, edgecolors = edgecolors[key], label = key)
+
+    plt.legend()
+    plt.xlabel('First Principal Component', fontsize = 17)
+    plt.ylabel('Second Principal Component', fontsize = 17)
+
+
+def plot_pairs(new_data: pd.DataFrame) -> None:
+    '''
+        Draws a light grey segment between the projections of the two mice
+        of each pair.
+        
+        Args: new_data: input 2-dimensional dataset with mice as data points and
+              'pca0' and 'pca1' among its columns.
+              Mice belonging to the same pair have to appear consecutively in 
+              the dataset (rows 0-1, 2-3, ...).
+        Returns: 
+            None. The segments are drawn on the current matplotlib figure.
+        
+    '''
+    for first in range(0, new_data.shape[0], 2):
+        mouse_a = new_data.iloc[first]
+        mouse_b = new_data.iloc[first + 1]
+        x_coords = [mouse_a['pca0'], mouse_b['pca0']]
+        y_coords = [mouse_a['pca1'], mouse_b['pca1']]
+        plt.plot(x_coords, y_coords, color = 'grey', alpha = 0.1)
+
+    plt.xlabel('First Principal Component', fontsize = 17)
+    plt.ylabel('Second Principal Component', fontsize = 17)
+    
+    
+    
+    
+def plot_loadings(v1: list,v2: list):
+    """
+        Draws the PCA loadings of the first two components as overlapping
+        bar charts and shows the figure.
+
+        The x ticks are labelled with the seven numerical feature names used
+        for the PCA.
+        
+        Args: 
+            v1: loadings for PC1
+            v2: loadings for PC2
+    """
+    feature_names = ['chases','attacks','other_comp','locomotion',
+                     'ur_postures','flights','hiding']
+    positions_pc1 = list(range(len(v1)))
+    positions_pc2 = list(range(len(v2)))
+
+    plt.figure(figsize=(5,3))
+    plt.bar(positions_pc1,v1,color = 'teal', width = 0.8, alpha = 0.3,label = 'PC1')
+    plt.bar(positions_pc2,v2,color = 'brown', width = 0.5, alpha = 0.4,label = 'PC2')
+    # Faint guide lines: one vertical line per feature and the zero baseline
+    plt.vlines(positions_pc1,-1,1,color = 'grey', linewidth = 1, alpha = 0.2)
+    plt.hlines(0,-10,10,color = 'grey', linewidth = 1, alpha = 0.2)
+
+    plt.xlim((-0.7,7.7))
+    plt.xticks(positions_pc1,feature_names,rotation = 45)
+
+    plt.title('Factor loadings for PC1 and PC2')
+    plt.legend()
+    #plt.savefig('Pictures/loadings.svg')
+    plt.show()
+
+    
+    
+    
+# Script section: run the PCA on the T2 data of all strains, plot the
+# projected data and the loadings.
+try:
+    print('\nPerforming PCA analysis')
+    # 'all animals' is not in STRAINS, so no strain filter is applied
+    dataset = create_subset_for_pca('all animals','T2')
+        
+    numerical_columns = ['chases','attacks','other_comp','locomotion',
+                         'ur_postures','flights','hiding']
+    
+    # Standardize the features before PCA; keep pandas output to preserve the index
+    scaler = StandardScaler().set_output(transform = "pandas")
+    scaled_dataset = scaler.fit_transform(dataset[numerical_columns])
+       
+    pca = PCA(n_components=scaled_dataset.shape[1]).set_output(transform="pandas")
+    pca.fit(scaled_dataset)
+    new_data = pca.transform(scaled_dataset)
+    print('explained variance: ', [round(i,3) for i in pca.explained_variance_ratio_])
+    
+    # Re-attach the descriptive columns to the projected data
+    new_data[['video','mouse','line',
+              'time','interaction']] = dataset[['video','mouse','line',
+                                                'time','interaction']]
+    plt.figure(figsize=(7,7))
+    plot_projected_data(new_data,'line')
+    plot_pairs(new_data)
+    plt.xlim((-4.7,6.3))
+    plt.ylim((-2.3,5.5))
+    plt.title('PCA', fontsize = 20)
+    plt.show()
+    
+    
+    # Within each video, the mouse with the larger PC1 score is labelled
+    # 'dominant'. This relies on every video having exactly two consecutive rows.
+    videos = list(new_data.video.unique())
+    new_status = []
+    for video in videos:
+        pair_scores = new_data.pca0[new_data.video == video].values
+        if pair_scores[0] > pair_scores[1]:
+            new_status.extend(['dominant','subordinate'])
+        else: 
+            new_status.extend(['subordinate','dominant'])
+    new_data['status'] = new_status
+    
+    v1 = pca.components_[0,:]
+    v2 = pca.components_[1,:]
+    plot_loadings(v1,v2)
+    
+    print()
+except Exception as error:
+    # Report the cause instead of hiding it; unlike a bare `except`, this lets
+    # KeyboardInterrupt and SystemExit propagate.
+    print('Failed:', repr(error))
\ No newline at end of file
--- a/permutations.R
+++ b/permutations.R
+# get arguments from command line:
+#   1: input file (required)   2: output file (default "out.txt")
+#   3: random seed (default 21)   4: number of permutations (default 2000)
+args = commandArgs(trailingOnly=TRUE)
+
+# test if there is at least one argument: if not, return an error
+if (length(args)==0) {
+  stop("At least one argument must be supplied (input file).\n", call.=FALSE)
+} else if (length(args)==1) {
+  # default output file
+  args[2] = "out.txt"
+}
+
+# check if seed was provided, otherwise set it to 21
+# (command line arguments are character strings, so convert them explicitly)
+if (length(args)>2){
+  seed <- as.integer(args[3])
+  if (is.na(seed)) {
+    stop("The seed (third argument) must be an integer.\n", call.=FALSE)
+  }
+} else{
+  print("Setting seed to 21.")
+  seed <- 21
+}
+
+# check if number of permutations is provided, otherwise set them to 2000
+if (length(args)>3){
+  N <- as.integer(args[4])
+  if (is.na(N) || N < 1) {
+    stop("The number of permutations (fourth argument) must be a positive integer.\n", call.=FALSE)
+  }
+}else{
+  print("Setting number of permutations to N=2000.")
+  N <- 2000
+}
+
+# required libraries
+library(tidyverse)
+library(readxl)
+
+#### DEFINE FUNCTIONS
+# calculate the average difference:
+# for each pair, take the difference between the largest and the smallest
+# score, then average these differences over all pairs
+average_diff <- function(df){
+  per_pair <- df %>% 
+    group_by(pair) %>% 
+    summarize(diff = max(score) - min(score))
+  mean(pull(per_pair, diff))
+}
+
+# get a null distribution for average differences by permutation:
+# shuffle the pair labels n times and recompute the average difference each time
+null_mean <- function(df,n=2000){
+  permuted_average_diff <- function(){
+    shuffled <- mutate(df, pair = sample(pair))
+    average_diff(shuffled)
+  }
+  replicate(n, permuted_average_diff())
+}
+
+# the permutation test as a whole:
+# returns the fraction of permuted average differences that are at least as
+# large as the observed one (the permutation p-value)
+perm_test <- function(df, n){
+  observed <- average_diff(df)
+  permuted <- null_mean(df,n)
+  at_least_as_large <- permuted >= observed
+  mean(at_least_as_large)
+}
+
+
+
+# load data
+data <- read_xlsx(args[1])
+data <- data %>% 
+  rename(mouse_line = `Mouse_line`)
+
+# Transform the data into long format 
+data_long <- data %>% 
+  pivot_longer(
+    -c(mouse_line,pair),
+    names_to = "behavior",
+    values_to = "score"
+  )
+
+#  RUN PERMUTATION TESTS for all data in input
+
+# set seed: use the value selected from the command line (default 21)
+# instead of a hard-coded one
+set.seed(as.integer(seed))
+
+# number of permutations selected from the command line (default 2000)
+n_permutations <- as.integer(N)
+
+print("calculating permutations...")
+results <- data_long %>% 
+  filter(!is.na(score)) %>% 
+  group_by(mouse_line, behavior) %>% 
+  summarize(p = perm_test(data.frame(score,pair),n=n_permutations))
+
+print("writing results to output file")
+write_tsv(results,file=args[2])
+
+
+
+
+
--- a/plot_window.py
+++ b/plot_window.py
+"""
+Module for plotting overlapping distance or speed curves extracted from
+windows centered around all flight onsets.
+    
+"""
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+from matplotlib import rcParams
+import window_analysis as w
+from scipy.stats import sem
+
+
+MOUSE1 = 'M1'
+MOUSE2 = 'M2'
+RIGHT = 60
+LEFT = 30
+FRAMES_PER_SECOND = 45
+FRAME_RATE_ADAPT = 9
+PIXEL_TO_CM = 15.4
+MAX_LENGTH = 3000
+
+BEHAVIORS = ['Fl','A','C','U']
+LABELS_PATH = 'Labels.csv'
+
+
+
+
+def plot_behavioral_data(video: str) -> None:
+    '''
+        This function plots overlapping distance and speed traces of one pair of
+        mice for a single recording, as well as vertical lines corresponding to
+        behavioral events (attack, flight, chase, upright posture).
+        The recording is split into consecutive chunks of 3000 frames and one
+        figure is shown per chunk.
+        These vertical lines are colored according to the behavior
+        with _get_behavior_color(...).
+        Line style is chosen based on the mouse (M1 or M2) with _get_line_style(...)
+
+        Args:
+            video: name of the recording from which distance and speed are extracted
+        Returns: 
+            None. Distance and speed curves are plotted and shown.
+    '''
+
+    data = w.get_data(video)
+    behavioral_scoring_data = w.get_behavioral_scoring_data(video, BEHAVIORS)
+    beh_list = w.get_scoring_list(BEHAVIORS)
+    dist = w.compute_distance(data)
+    speed_mouse_1 = w.compute_speed(data,'M1')
+    speed_mouse_2 = w.compute_speed(data,'M2')
+
+    chunk = 3000
+    n_frames = data.shape[0]
+
+    for first in range(0, n_frames, chunk):
+        last = first + chunk
+        plt.figure(figsize=(25, 3))
+
+        for beh in beh_list:
+            def event_marker(label, beh=beh):
+                # High value where the behavior is scored, below the axis elsewhere
+                return 3000 if label == beh else -100
+
+            plt.plot(behavioral_scoring_data.Default[first:last].apply(event_marker),
+                     label=beh, color=_get_behavior_color(beh),
+                     linestyle = _get_line_style(beh))
+
+        plt.plot(dist[first:last]/PIXEL_TO_CM, label='Mean distance', color='black')
+        plt.plot(speed_mouse_1.loc[first:last]/PIXEL_TO_CM,
+                 label='speed mouse 1', color='orange', alpha=1)
+        plt.plot(speed_mouse_2.loc[first:last]/PIXEL_TO_CM,
+                 label='speed mouse 2', color='purple', alpha=1)
+
+        plt.ylim((0, 150))
+        plt.xlabel('Frames', fontsize=15)
+        plt.ylabel('Distance (cm) / Speed (cm/s)', fontsize=15)
+        plt.axhline(y=300, color='black', linestyle='--')
+        plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
+        plt.show()
+        
+        
+
+def _get_behavior_color(behavior: str) -> str:
+    '''
+        Returns the color to be used for the vertical lines in
+        plot_behavioral_data(...) based on the behavior the lines represent
+
+        The behavior label is matched against the substrings 'C-', 'Cl', 'A',
+        'Fl' and 'R', in this order; the first match determines the color.
+        Labels matching none of them are drawn in blue.
+
+        Args:
+            behavior: type of behavior
+        Returns:
+            color name for vertical line
+    '''
+    # Checked from most to least specific so that a later pattern never
+    # overwrites an earlier match (previously every label without 'C-'
+    # ended up 'blue').
+    if 'C-' in behavior:
+        color = 'chartreuse'
+    elif 'Cl' in behavior:
+        color = 'red'
+    elif 'A' in behavior:
+        color = 'green'
+    elif 'Fl' in behavior:
+        color = 'deepskyblue'
+    elif 'R' in behavior:
+        color = 'grey'
+    else:
+        color = 'blue'
+    return color
+
+
+def _get_line_style(behavior: str) -> str:
+    '''
+        Chooses the line style of the vertical lines drawn in
+        plot_behavioral_data(..): dotted when the behavior label mentions
+        'M1', solid otherwise.
+
+        Args:
+            behavior: type of behavior
+        Returns:
+            line style for vertical line
+    '''
+    return 'dotted' if 'M1' in behavior else 'solid'
+    
+
+    
+def plot_behavioral_data_example(start: int, end: int, video: str) -> None:
+
+    '''
+        This function plots distance and speed traces of one pair of mice for
+        a single recording in two separate figures, for a subset of the entire
+        recording, between frame = start and frame = end.
+        Both the distance between mice and the pair of speed curves are
+        represented together with the vertical lines corresponding to behavioral
+        events (attack, flight, chase, upright posture).
+        These lines are colored according
+        to the behavior with _get_behavior_color(...).
+        Line style is chosen based on the mouse (M1 or M2)
+        with _get_line_style(...)
+        
+        Args: 
+            start: first frame of the plotted interval
+            end: last frame of the plotted interval
+            video: name of the recording from which speed and distance
+                are extracted
+                
+        Returns:
+            None. The distance figure and the speed figure are shown.
+    '''
+
+    data = w.get_data(video)
+    
+    behavioral_scoring_data = w.get_behavioral_scoring_data(video,BEHAVIORS)
+    beh_list = w.get_scoring_list(BEHAVIORS)
+    dist = w.compute_distance(data)
+    speed_mouse_1 = w.compute_speed(data,'M1')
+    speed_mouse_2 = w.compute_speed(data,'M2')
+
+    def draw_behavior_lines():
+        # One trace per behavior: 3000 where it is scored, -100 elsewhere
+        for beh in beh_list:
+            def event_marker(label, beh=beh):
+                return 3000 if label == beh else -100
+
+            plt.plot(behavioral_scoring_data.Default[start:end].apply(event_marker),
+                     label=beh, color=_get_behavior_color(beh),
+                     linestyle=_get_line_style(beh))
+    
+    _set_plot_style()
+
+    # First figure: distance between the two mice
+    plt.figure(figsize=(13, 3))
+    draw_behavior_lines()
+    plt.plot(dist[start:end] / PIXEL_TO_CM, label='Distance', color='red')
+    plt.ylim((0, 80))
+    plt.axhline(y=300, color='black', linestyle='--')
+    _set_plot_labels(start, end, 45)
+    plt.show()
+
+    # Second figure: speed of each mouse
+    plt.figure(figsize=(13, 3))
+    draw_behavior_lines()
+    plt.plot(speed_mouse_1.loc[start:end] / PIXEL_TO_CM,
+             label='speed 1', color='black', alpha=1, linestyle='dotted')
+    plt.plot(speed_mouse_2.loc[start:end] / PIXEL_TO_CM,
+             label='speed 2', color='black', alpha=1)
+    plt.ylim((0, 80))
+    _set_plot_labels(start, end, 45)
+    plt.show()
+    
+    
+
+def _set_plot_style() -> None:
+    '''
+        Configures the matplotlib axes for plot_behavioral_data_example(...):
+        only the bottom and left spines are drawn, in black.
+    '''
+    rcParams.update({
+        'axes.spines.bottom': True,
+        'axes.spines.left': True,
+        'axes.spines.right': False,
+        'axes.spines.top': False,
+        'axes.edgecolor': 'black',
+    })
+
+
+def _set_plot_labels(start: int, end: int, frame_rate: int) -> None:
+    '''
+        Sets labels for plot_behavioral_data_example(...)
+
+        One x tick is placed every `frame_rate` frames between start and end
+        and labelled with the time in seconds elapsed since `start`.
+        The y label is always 'Distance (cm)'.
+        
+        Args: 
+            start: start value for x ticks
+            end: end value for x ticks
+            frame_rate: step between two consecutive x ticks
+    '''
+    ticks = list(range(start, end + 1, frame_rate))
+    # Derive the labels from the ticks so that both always have the same
+    # length, whatever the size of the plotted interval
+    labels = [round((tick - start) / frame_rate) for tick in ticks]
+
+    plt.xlabel('Time (s)', fontsize=10)
+    plt.ylabel('Distance (cm)', fontsize=10)
+    plt.xticks(ticks=ticks, labels=labels, fontsize=10)
+    plt.yticks(fontsize=10)
+    plt.xlim((start, end))
+
+
+    
+def plot_window(curve_list: list[pd.Series], curve: pd.Series, title: str) -> None:
+    '''
+        Plots the curves produced in compute_window(..)
+
+        The x axis is labelled in seconds relative to frame 180 of the
+        window, with one tick every 45 frames. When the title contains
+        'speed' or 'distance', a vertical line is drawn at frame 180 and the
+        y axis is labelled accordingly; any other title only gets the curves
+        and the x axis.
+        
+        Args: 
+            curve_list: curves to be represented as overlapping 
+                in the same plot. NOTE(review): the code uses `.shape` and
+                `.T`, so a 2-dimensional array-like (one curve per row) is
+                presumably expected rather than a plain list - confirm.
+            curve: single curve to be plotted together with curve_list. 
+                Generally, this curve represents the average curve
+            title: name of the plotted quantity; selects the axis settings
+    '''
+    if curve_list.shape[0] == 0:
+        print("empty curve list")
+        return
+
+    # Defined up front so that the tick labels below can always be computed,
+    # even when the title matches neither 'speed' nor 'distance'
+    start = 180
+    step = 45
+    max_y = 140
+
+    plt.plot(curve_list.T, color = 'grey', alpha = 0.1)
+
+    if 'speed' in title:
+        plt.vlines(start,0,max_y,color = 'black')
+        plt.ylim((-2,max_y))
+        plt.ylabel('Speed (cm/s)')
+    elif 'distance' in title:
+        plt.vlines(start,0,max_y,color = 'black')
+        plt.ylim((-4,max_y))
+        plt.ylabel('Distance (cm)')
+    plt.plot(curve, color = 'red')
+
+    plt.xlabel('Time (s)',fontsize = 12)
+    ticks = list(range(0,curve.shape[0],step))
+    labels = [round((tick - start)/45) for tick in ticks]
+    plt.xticks(ticks = ticks, labels = labels)
+    
+    
+    
+    
+    
+    else:
+        print("empty curve list")
+    
+    
+    
+    
+    
+# Script section: plot example traces, then the distance curves in a window
+# around flight onsets together with their mean and standard error.
+try:
+    # plot entire speed and distance trace of a recording
+    print('\nPlot speed and distance over time')
+    plot_behavioral_data('CD1-B1-MS_US-T1.2.csv')
+    
+    # plot traces within one interval
+    video = 'CD1-B6-UN_LS-T1.2.csv'
+    plot_behavioral_data_example(2170, 3520, video)
+    
+    
+    line = 'C57j'
+    time = 'T1'
+    left_length = 180
+    right_length = 180
+
+    curve_to_visualize = 'distance'
+    behavior = 'flights_without_chases'
+    align_criterium = 'max_slope'
+
+    if align_criterium:
+        crit_title = ' -  alignment by ' + align_criterium
+    else:
+        crit_title = ' - no aligment'
+
+    # keep only the recordings of the selected strain and recording time
+    video_list_all = w.get_video_list()
+    video_list = [video for video in video_list_all if time in video and line in video]
+    
+    print('Computing average distance and speed traces around flight onset for all listed recordings...\n')
+    curve_list = w.compute_window(video_list, behavior, left_length, 
+                                  right_length, curve_to_visualize, 
+                                  align_criterium)    
+    
+    mean_curve = np.nanmean(curve_list, axis = 0) 
+
+    plt.figure(figsize = (6,6))
+    plot_window(curve_list, mean_curve, curve_to_visualize)
+    plt.show()
+    
+    # mean curve with a band of one standard error around it
+    plt.figure(figsize = (6,6))   
+    st_error = sem(curve_list)
+    plt.plot(mean_curve)
+    x = list(range(left_length + right_length+1))
+    plt.fill_between(x,mean_curve+st_error, mean_curve-st_error,alpha = 0.4)
+    plt.show()
+
+except Exception as error:
+    # Report the cause instead of hiding it; unlike a bare `except`, this lets
+    # KeyboardInterrupt and SystemExit propagate.
+    print('Failed:', repr(error))
\ No newline at end of file
--- a/proximity.py
+++ b/proximity.py
+"""
+Co-habitation time spent within short distance
+
+In this script, we measure the duration that two mice from the same pair spend in close proximity to each other. "Close" is defined as follows:
+- They must be in the same ROI.
+- The Euclidean distance between them must be less than a predefined threshold, referred to here as the "radius."
+
+
+Input data:
+
+|   |video         |animal |roi      |entry|exit|
+|-  |-             |-      |-        |-    |-   |
+|0  |C57j-B6-US_MS |UM     |Up-corn1 |1214 |1477|
+|1  |C57j-B6-US_MS |UM     |Up-corn1 |1690 |2500|
+|2  |C57j-B6-US_MS |UM     |Up-corn1 |2570 |2598|
+|3  |C57j-B6-US_MS |UM     |Up-corn1 |3688 |3737|
+|4  |C57j-B6-US_MS |UM     |Up-corn1 |7210 |7301|
+|.. |..            |..     |..       |..   |..  |  
+
+
+"""
+
+
+import importlib
+import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
+import window_analysis as d
+
+PIXEL_TO_CM = 15.4
+ROIS = ['Down-HF','Down-comp','Down-corn1','Down-corn2','Down-corn3','Down-corn4','Down-nest','Down-HT',
+        'Up-HF','Up-comp','Up-corn1','Up-corn2','Up-corn3','Up-corn4','Up-nest','Up-HT']
+
+
+
+def delete_hiding_noise(data: pd.DataFrame, min_frames: int | None = None) -> pd.DataFrame:
+    '''
+        Sometimes, a mouse moving very close to an ROI is incorrectly detected as being
+        inside this ROI for a few frames. The following function removes those rows where
+        the mouse stays in one ROI for at most min_frames frames (one second by default),
+        aiming to reduce noise in the dataset.
+
+        The input dataset is left untouched; a filtered copy with a fresh
+        0..n-1 index is returned.
+        
+        Args: 
+            data: dataset containing ROI information for each mouse along with the 
+                  corresponding entry and exit time frames.
+            min_frames: visits lasting this many frames or fewer are discarded.
+                  Defaults to the FRAMES_PER_SECOND constant of window_analysis
+                  (this assumes entry/exit are expressed in coordinate-file frames).
+        Returns: 
+            denoised version of the input dataset
+    '''
+    if min_frames is None:
+        # The FRAMERATE name used here before was never defined in this module
+        min_frames = d.FRAMES_PER_SECOND
+    too_short = (data.exit - data.entry) <= min_frames
+    # drop = True: do not keep the old row labels as an extra 'index' column
+    return data[~too_short].reset_index(drop = True)
+
+
+
+def create_rois_per_frame(data: pd.DataFrame, n_frames: int = 27000):
+    """
+        Creates a dataset with one row per frame and one column per mouse
+        (named <video>-UM and <video>-LM), each cell holding the ROI occupied
+        by that mouse at that frame (None when no ROI is recorded):
+        |     | C57j-B6-UN_LS-T1.1-UM | C57j-B6-UN_LS-T1.1-LM | ... | CD1-B6-UN_LS-T1.2-LM  |
+        |-----|-----------------------|-----------------------|-----|-----------------------|
+        | 0   | Down-corn2            | Down-comp             | ... | Up-corn4              |
+        | 1   | Down-corn2            | Down-comp             | ... | Up-corn4              |
+        | 2   | Down-corn2            | Down-comp             | ... | Up-corn4              |
+        | ... | ...                   | ...                   | ... | ...                   |
+    
+        Args:
+            data: dataset containing ROI information for each mouse along with the 
+                  corresponding entry and exit time frames.
+            n_frames: number of frames (rows) of the output dataset
+        Returns: 
+            Reshaped version of data indicating ROIs information for each mouse and for each frame
+    """
+
+    data = delete_hiding_noise(data)
+    columns = []
+
+    for video in data.video.unique():
+        video_rows = data[data.video == video]
+        for mouse in ['UM','LM']:
+            mouse_rows = video_rows[video_rows.animal == mouse]
+            column = pd.Series([None] * n_frames, name = video + '-' + mouse)
+            # ROIs are written in the order of ROIS, so for overlapping visits
+            # the ROI listed later wins
+            for roi in ROIS:
+                visits = mouse_rows[mouse_rows.roi == roi]
+                for start, end in zip(visits.entry, visits.exit):
+                    # scalar assignment also works when the visit runs past
+                    # the last frame (the slice is simply truncated)
+                    column.iloc[int(start):int(end) + 1] = roi
+            columns.append(column)
+
+    if not columns:
+        return pd.DataFrame()
+    # a single concatenation instead of re-copying the dataset for each mouse
+    return pd.concat(columns, axis = 1)
+
+
+
+def replace_names(rois_per_frame: pd.DataFrame):
+    """
+        Simplify ROI names in place:
+        the up/down corners are merged with their respective up/down
+        compartments, i.e. 'Up-comp' and 'Up-corn1..4' become 'upcomp',
+        'Down-comp' and 'Down-corn1..4' become 'downcomp'.
+        All other values are left unchanged.
+        
+        Args:
+            rois_per_frame: data indicating ROIs information for each mouse and for each frame
+        Returns:
+            the same (modified) dataset with simplified ROIs nomenclature
+    """
+    merged_names = {
+        'downcomp': ['Down-comp','Down-corn1','Down-corn2','Down-corn3','Down-corn4'],
+        'upcomp': ['Up-comp','Up-corn1','Up-corn2','Up-corn3','Up-corn4'],
+    }
+    renaming = {old_name: new_name
+                for new_name, old_names in merged_names.items()
+                for old_name in old_names}
+    rois_per_frame.replace(renaming, inplace = True)
+    return rois_per_frame
+
+
+
+
+def _as_time_list(times) -> list:
+    """Return times as a list of labels; a single string such as 'T1' is wrapped."""
+    return [times] if isinstance(times, str) else list(times)
+
+
+
+def cohabitation_distance(radius: int, times: str, rois_per_frame: pd.DataFrame) -> dict:
+    """
+    Computes portion of time spent in the same ROI and within radius distance
+    Args:
+        radius: max distance
+        times:  'T1' (early) or 'T2' (late), or a list of such labels.
+                A single string is treated as one label (it is no longer
+                iterated character by character).
+        rois_per_frame: data indicating ROIs information for each mouse and for each frame;
+                its ROI names are simplified in place by replace_names
+    Returns: 
+        time_close_dict: dictionary storing time spend close to each other
+                        by each recording pair for all strains, indexed as
+                        time_close_dict[line][time] -> list of fractions
+    
+    """
+    rois_per_frame = replace_names(rois_per_frame)
+    time_labels = _as_time_list(times)
+    time_close_dict = {}
+    for line in ['CD1','Hyb','C57j']:
+        time_close_line_dict = {}
+
+        for time in time_labels: 
+            columns = [c for c in rois_per_frame.columns.tolist() if (line in c and time in c)]
+            time_close = []
+
+            # the two columns of a recording are adjacent (<video>-UM, <video>-LM)
+            for first_mouse, second_mouse in zip(columns[0::2], columns[1::2]):
+                video = first_mouse[:-3] + '.csv'    # strip the '-UM' suffix
+                data = d.get_data(video)
+                dist = d.compute_distance(data) / PIXEL_TO_CM
+                within_radius = (dist < radius).to_numpy()
+                same_roi = (rois_per_frame[first_mouse] == rois_per_frame[second_mouse]).to_numpy()
+
+                # compare only the frames available in both sources
+                n_common = min(within_radius.shape[0], same_roi.shape[0])
+                num_frames_close = np.count_nonzero(within_radius[:n_common] & same_roi[:n_common])
+                tot_time = dist.shape[0]
+                time_close.append(num_frames_close/tot_time if tot_time else float('nan'))
+
+            time_close_line_dict[time] = time_close
+        time_close_dict[line] = time_close_line_dict
+    return time_close_dict
+
+
+
+
+def plot_cohabitatio_distance(times: str, time_close_dict: dict, radius: int | None = None):
+    """
+    Plots mean portion of time spent in the same ROI and within radius distance
+    for each line.
+     Args:
+        times:  'T1' (early) or 'T2' (late), or a list of such labels; must
+                match the labels used to build time_close_dict
+        time_close_dict: dictionary storing time spend close to each other
+                        by each recording pair for all strains.
+        radius: distance threshold shown in the plot title; the title is
+                omitted when radius is None
+    
+    """
+    time_labels = _as_time_list(times)
+    positions = list(range(len(time_labels)))
+    plt.figure(figsize = (4,3))
+
+    for line in ['CD1','Hyb','C57j']:
+        means = [np.mean(time_close_dict[line][time]) for time in time_labels]
+        plt.plot(positions, means, label = line)
+        plt.scatter(positions, means, s=100)
+
+    # figure-level settings only need to be applied once
+    if radius is not None:
+        plt.title('Radius distance: ' + str(radius) + 'cm')
+    plt.xticks(ticks = positions, labels = time_labels, fontsize = 15)
+    plt.ylim((0,0.5))
+    plt.xlim((-0.2, max(positions, default = 0) + 0.2))
+    plt.legend()
+    plt.show()
+
+
+
+try:
+    radius = 10   # distance threshold in cm
+    data = pd.read_csv('input_data.csv',low_memory=False)  # change path name for input data
+    rois_per_frame = create_rois_per_frame(data)
+    time_close_dict = cohabitation_distance(radius, "T1", rois_per_frame)
+    plot_cohabitatio_distance("T1", time_close_dict, radius)
+
+except Exception as error:
+    # report the cause instead of hiding it
+    print(f"Failed: {error}")
--- a/readme.html
+++ b/readme.html
--- a/readme.md
+++ b/readme.md
+
+
+# Induction of Territorial Behavior and Dominance Hierarchies in Laboratory Mice
+
+This repository contains the code used for analyzing the research described in the paper by
+Dorian Battivelli, Lucas Boldrini, Mohit Jaiswal, Pradnya Patil, Sofia Torchia, Elizabeth Engelen, Luca Spagnoletti, Sarah Kaspar and Cornelius T. Gross: "**Induction of territorial behavior and dominance hierarchies in laboratory mice**".<br>
+You can explore the data associated with this study [here]().
+
+#### Abstract
+
+Territorial behaviors comprise a set of coordinated actions and response patterns found across animal species that promote the exclusive access to resources. House mice are highly territorial with a subset of males consistently attacking and chasing competing males to expel them from their territories and performing urine marking behaviors to signal the extent of their territories. Natural variation in territorial behaviors within a mouse colony leads to the formation of dominance hierarchies in which subordinate males can reside within the territory of a dominant male. While the full repertoire of such territorial behaviors and hierarchies has been extensively studied in wild-derived mice in semi-natural enclosures, so far they have not been established in the smaller enclosures and with the genetically-defined laboratory strains required for the application of neural recording and manipulation methods. Here, we present a protocol to induce an extensive repertoire of territorial behaviors in small enclosures in laboratory mice, including a method for the simultaneous tracking of urine marking behavior in mouse pairs. Using this protocol we describe the emergence of robust dominant-subordinate hierarchies between pairs of CD1 outbred or CD1xB6 F1 hybrid mice, but  unexpectedly not in C57BL/6 inbred animals. Our behavioral paradigm opens the door for neurocircuit studies of territorial behaviors and social hierarchy in the laboratory.
+
+## Overview
+
+The repository includes code for the following analyses: <br>
+1. Average Distance and Speed:
+    - ```window_analysis.py``` computes the distance between two mice and the speed of a single mouse over time.
+    - ```plot_window.py``` generates plots showing overlapping distance or speed curves extracted from windows centered around all flight onsets.
+2. Permutation test for comparison with baseline interaction
+3. Unbiased Dominance Score via PCA Analysis
+4. Proximity measure
+5. Time Spent Hiding
+
+
+### 1) Average distance and speed 
+```window_analysis.py``` implements the necessary steps to compute distance between two mice and speed of a mouse over time, while ```plot_window.py``` produces plots for overlapping distance or speed curves extracted from windows centered around all flight onsets. <br>
+Here, the input data (coordinate files and behavioral annotation
+files) are .csv files stored in two folders named, respectively,
+"Data" and "Behavioral scoring". We also assign each mouse a dominance status ("dominant" or "subordinate") through ```Labels.csv```.
+
+In order to run this code, organize data and code as follows:
+- Data:
+    - ```CD1-B1-MS_US-T1.1.csv```
+    - ```CD1-B1-MS_US-T1.2.csv```
+    - ...  
+- Behavioral scoring:
+    - ```CD1-B1-MS_US-T1.1.csv```
+    - ```CD1-B1-MS_US-T1.2.csv```
+    - ...
+- ```Labels.csv```
+- ```window_analysis.py```
+- ```plot_window.py```
+
+
+    
+Each mouse is originally identified by the x and y coordinates of eight points on its body. These coordinates are tracked over the whole recording and stored in a .csv file in "Data":
+
+| <span style="color:grey">UM_Ear_left_1_x</span> | <span style="color:grey">UM_Ear_left_1_y</span> | <span style="color:grey">UM_Ear_left_1_p</span> | <span style="color:grey">UM_Nose_1_x</span> | <span style="color:grey">UM_Nose_1_y</span> | ... | <span style="color:grey">LM_Lat_left_2_p</span> | <span style="color:grey">LM_Lat_right_2_x</span> | <span style="color:grey">LM_Lat_right_2_y</span> | <span style="color:grey">LM_Lat_right_2_p</span> | <span style="color:grey">LM_Tail_base_2_x</span> | <span style="color:grey">LM_Tail_base_2_y</span> | <span style="color:grey">LM_Tail_base_2_p</span> |
+|-------------------------|-------------------------|-------------------------|--------------------|--------------------|-----|-------------------------|--------------------------|--------------------------|--------------------------|---------------------------|---------------------------|---------------------------|
+| 1174.5143               | 247.42857               | 1.0                     | 1139.6571          | 258.20000          | ... | 1.0                     | 1968.7428                | 1937.9714                | 1.0                      | 1932.0857                | 1974.3429                | 0.999257                  |
+| 1175.5714               | 248.74286               | 1.0                     | 1139.6000          | 254.71428          | ... | 1.0                     | 1968.1714                | 1935.8572                | 1.0                      | 1931.7428                | 1974.4857                | 0.999743                  |
+| 1176.0000               | 249.91429               | 1.0                     | 1139.4000          | 252.65715          | ... | 1.0                     | 1968.1714                | 1935.1714                | 1.0                      | 1931.2572                | 1974.3429                | 1.000086                  |
+| 1174.8286               | 250.17143               | 1.0                     | 1139.2572          | 253.82857          | ... | 1.0                     | 1968.8286                | 1936.5714                | 1.0                      | 1930.9143                | 1973.9143                | 1.000000                  |
+| 1174.6857               | 249.82857               | 1.0                     | 1139.2572          | 253.74286          | ... | 1.0                     | 1968.8286                | 1936.9143                | 1.0                      | 1931.0857                | 1973.9143                | 1.000000                  |
+|...|...|...|...|...|...|...|...|...|...|...|...|...|...|
+
+
+
+
+Each behavioral annotation file in "Behavioral scoring" is similar to the table below, where `C-M2-C1` means that mouse M2 chases (C) mouse M1 in compartment C1:
+
+|     |<span style="color:grey">Time</span>  |  <span style="color:grey">Default</span>|
+| -   | - |  -|
+|0    |0.00   | NaN|
+|1    |0.20  |  NaN|
+|2    |0.40  |  NaN|
+|3    |0.60  |  C-M2-C1 |
+|4    |0.80  |  NaN|
+|...  |...    | ...|
+
+```Labels.csv``` has the following structure:
+|    |    <span style="color:grey">Video</span>                   |  <span style="color:grey">Dominant</span> |    <span style="color:grey">Subordinate</span> |   <span style="color:grey">Start_frame</span>|
+|-|-|-|-|-|
+|0     |  C57j-B1-MS_US-T1.1.csv  |  M2        |   M1          |    495|
+|1   |    C57j-B1-UN_LS-T1.1.csv  |  M1       |    M2          |    0|
+|2   |    C57j-B2-MS_US-T1.1.csv  |  M1       |    M2         |     675|
+|3   |    C57j-B2-UN_LS-T1.1.csv  |  M2       |    M1          |    0|
+|... |    ...                     |  ...      |    ...         |    ...|
+
+
+Behavior is annotated at 5 frames per second, 
+while coordinates over time are extracted at 45 frames per second. For each behavioral scoring file the first frames are discarded; ```Labels.csv``` stores the actual initial frame for these files in the column "Start_frame".
+
+### 2) Permutation test for comparison with baseline interaction
+The permutation tests run on an `.xlsx` data file, which has the columns `Mouse_line`, `pair`, and one column for each behavior to be tested.
+
+### 3) Unbiased dominance score via PCA analysis
+File organization:
+- pca_analysis.py
+- pca_dataset.csv
+
+
+
+
+
+
+The script pca_analysis.py operates on pca_dataset.csv which has the following structure:
+
+| | <span style="color:grey">video&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;</span>|	<span style="color:grey">chases</span>|<span style="color:grey">attacks</span>|<span style="color:grey">other_comp</span>|<span style="color:grey">other_cage</span>|<span style="color:grey">locomotion</span>|	<span style="color:grey">ur_postures</span>|<span style="color:grey">flights</span>|<span style="color:grey">hiding</span>|<span style="color:grey">mouse</span>|<span style="color:grey">line</span>|<span style="color:grey">time</span>|<span style="color:grey">interaction</span>|
+|-  |-              | -|-        |-      |     -|-             |-       |-       |-      |-    | -  |-     |-    |
+|0 |CD1-B1-UN_LS-T1|0.0| 	25.0 |	9.5 |	0.0 |	15293.8694 |	6.0 |	10.0 |	21.5 |	UN |CD1 |	T1 |	1|
+|1 |CD1-B1-UN_LS-T1|15.0| 	32.0 |	50.5 |	12.0 |	12501.2516| 	0.0 |	1.0 |	12.5 |LS 	|CD1| 	T1 |   1|
+|2 |CD1-B1-MS_US-T1|2.0 |	25.0 |	41.5 |	2.0 |	15990.6214 |	0.0 |	0.0 |	23.0 |	MS 	|CD1 |	T1 | 	1|
+|3 |CD1-B1-MS_US-T1|6.0| 	23.0 |	32.0 |	1.0 |	13421.5625 |	0.0| 	5.0 |	22.5 |	US 	|CD1 |	T1 |	1|
+|...|...|...|...|...|...|...|...|...|...|...|...|...|...|
+
+
+### 4 - 5) Proximity measure and Hiding analysis
+
+Both analyses require the following data format:
+
+| |<span style="color:grey">video</span>| 	<span style="color:grey">animal</span> |	<span style="color:grey">roi</span>| 	<span style="color:grey">entry</span>| 	<span style="color:grey">exit</span>|
+|-|-|-|-|-|-|
+|0 |	C57j-B6-US_MS |	UM |	Up-corn1 |	1214 |	1477|
+|1 |	C57j-B6-US_MS| 	UM |	Up-corn1 |	1690 |	2500|
+|2 |	C57j-B6-US_MS |	UM |	Up-corn1 |	2570 |	2598|
+|3 |	C57j-B6-US_MS |	UM |	Up-corn1 |	3688 |	3737|
+
+The script for "proximity measure" quantifies the time two mice from the same pair spend close to each other. "Close" is defined by being in the same ROI and having a Euclidean distance less than a predefined threshold (here 10 centimeters), referred to as the "radius."<br>
+The script for "hiding" computes, for each mouse belonging to the same pair: 
+- the total time of the recording
+- the total time spent hiding
+- the portion of time spent in various regions (e.g. other compartment, other nest, other HF) <br><br>
+
+
+
+
+## Running this code
+Ensure Python version 3.10.0 is installed to execute the provided scripts successfully, and that all libraries listed in ```requirements.txt``` are available with specified versions. 
+You can do so by cloning this repository and running the following commands: <br>
+
+```
+conda create --name behavior_analysis python==3.10
+conda activate behavior_analysis
+pip install -r requirements.txt
+```
+You can now run the analysis for speed and distance from the repository directory by typing:
+```
+python plot_window.py
+```
+And the PCA exploration for dominance labeling with:
+```
+python pca_analysis.py
+```
+
+### Permutation tests
+
+
+For running the analysis 
+
+- make sure you have the libraries `tidyverse` and `readxl` installed 
+- navigate to the folder that contains `permutations.R` 
+
+Run the script as follows:
+
+```console
+Rscript --vanilla permutations.R INPUT_FILE OUTPUT_FILE
+```
+ 
+where `INPUT_FILE` gives the path to an `.xlsx` table containing the data to be analyzed, and `OUTPUT_FILE` is the path to store the results. The output file should be `.tsv`.
+
+Optionally, you can provide a seed as the third argument, and the number of permutations as the fourth argument. For example:
+
+```console
+Rscript --vanilla permutations.R data/T1.xlsx results/T1-results.tsv 22 1000
+```
+
+For this publication, the analysis was run with `seed = 21` and `N=10000` permutations, with R version 4.3.0 under macOS 14.4.1.
+
+## Authors
+Sofia Torchia <br>
+Sarah Kaspar
+
+
+
+
+
--- a/requirements.txt
+++ b/requirements.txt
+matplotlib == 3.7.1
+numpy == 1.24.3
+pandas == 2.0.1
+scipy == 1.10.1
+scikit-learn == 1.2.2
\ No newline at end of file
--- a/window_analysis.py
+++ b/window_analysis.py
+""" 
+Description:
+This module implements the necessary steps to compute
+distance between two mice and speed of a mouse over time.
+
+
+File organization:
+Here, we assume that the coordinate files and the behavioral scoring
+files are .csv files stored in folders named, respectively,
+Data and Behavioral scoring.
+We also assign each mouse a dominance status ("dominant" or "subordinate") 
+through Labels.csv
+
+In order to run this code, data and code have to be organized as follows:
+- Data:
+    - CD1-B1-MS_US-T1.1.csv
+    - CD1-B1-MS_US-T1.2.csv
+    - ...  
+- Behavioral scoring:
+    - CD1-B1-MS_US-T1.1.csv
+    - CD1-B1-MS_US-T1.2.csv
+    - ...
+- Labels.csv
+- window_analysis.py
+- plot_window.py
+
+    
+    
+
+Data structure:
+Each coordinate file in Data has a structure similar to:
+        MS_Ear_left_1_x      MS_Ear_left_1_y       ...           US_Tail_end_2_y
+0       1174.5143            247.42857             ...           1964.00000
+1       1175.5714            248.74286             ...           1964.08570
+2       1176.0000            249.91429             ...           1963.74280
+3       1174.8286            250.17143             ...           1963.25720
+4       1174.6857            249.82857             ...           1962.82860
+...     ...                  ...                   ...           ...
+
+Each behavioral file in Behavioral scoring folder is similar to:
+     Time    Default
+0    0.00    NaN
+1    0.20    NaN
+2    0.40    NaN
+3    0.60    C-M2-C1     # mouse M2 chases (C) mouse M1 in compartment C1
+4    0.80    NaN
+...  ...     ...
+
+Labels.csv has the following structure:
+        Video                     Dominant     Subordinate    Start_frame
+0       C57j-B1-MS_US-T1.1.csv    M2           M1              495
+1       C57j-B1-UN_LS-T1.1.csv    M1           M2              0
+2       C57j-B2-MS_US-T1.1.csv    M1           M2              675
+3       C57j-B2-UN_LS-T1.1.csv    M2           M1              0
+4       C57j-B3-MS_US-T1.1.csv    M1           M2              450
+...     ...                       ...          ...             ...
+
+Frame rate:
+Behavioral scoring is annotated at 5 frames per second, 
+while coordinates over time are extracted at 45 frames per second.
+
+
+"""
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+from matplotlib import rcParams
+
+
+MOUSE1 = 'M1'
+MOUSE2 = 'M2'
+RIGHT = 60
+LEFT = 30
+FRAMES_PER_SECOND = 45
+FRAME_RATE_ADAPT = 9
+PIXEL_TO_CM = 15.4
+MAX_LENGTH = 3000
+
+BEHAVIORS = ['Fl','A','C','U']  # list of annotated behaviors
+LABELS_PATH = 'Labels.csv'    # stores dominance status for all mice
+
+
+'''
+ TODO:
+1) delete unused functions
+3) constants
+4) add comments and examples
+5) find better names for functions
+6) strain or line or both?
+10) function order
+13) append or extend
+'''
+
+
+
+def get_data(video: str) -> pd.DataFrame:
+    '''
+        Loads the coordinate file of a recording from the Data folder.
+
+        The file is read as a .csv whose first row is the header
+        (see the module description for the expected column layout).
+        
+        Args:
+            video: name of recording stored in Data folder
+        Returns: 
+            pandas dataframe with the content of Data/<video>
+    '''
+    coordinates_path = f'Data/{video}'
+    return pd.read_csv(coordinates_path, header = 0)
+
+
+
+def compute_distance(data: pd.DataFrame) -> pd.Series:
+    '''
+        This function computes the euclidean distance over time between MOUSE1 and MOUSE2.
+
+        For each mouse, all its x columns and all its y columns (as selected
+        by get_column_names) are averaged frame by frame, so that the mouse is
+        represented by a single two-dimensional point. The euclidean distance
+        between the two points is returned for every frame.
+
+        Frames where one of the mean positions is missing (NaN) yield NaN
+        rather than a spurious distance of 0 (or a distance computed from a
+        single axis).
+        
+        Args:
+            data: dataset with mice coordinates
+        Returns:
+            time series (indexed 0..n-1) corresponding to distance between
+            mice over time
+    '''
+    mouse1_x, mouse1_y = get_column_names(data, MOUSE1)
+    mouse2_x, mouse2_y = get_column_names(data, MOUSE2)
+
+    # .values: work on plain arrays so the result gets a fresh 0..n-1 index
+    delta_x = data[mouse1_x].mean(axis = 1).values - data[mouse2_x].mean(axis = 1).values
+    delta_y = data[mouse1_y].mean(axis = 1).values - data[mouse2_y].mean(axis = 1).values
+
+    return pd.Series(np.sqrt(delta_x**2 + delta_y**2))
+
+
+
+def get_column_names(data: pd.DataFrame, mouse_label: str) -> tuple[list, list]:
+    '''
+       Given a mouse_label (MOUSE1 or MOUSE2), this function returns
+       the names of the columns holding the x and y coordinates of that mouse.
+       Columns of MOUSE1 are recognised by the '_1_x' / '_1_y' markers in
+       their name, columns of MOUSE2 by '_2_x' / '_2_y'.
+       
+       Args: 
+           data: dataset with mice coordinates over time
+           mouse_label: name of one specific mouse (MOUSE1 or MOUSE2)
+       Returns:
+           two lists: names of the x columns and names of the y columns
+           of the requested mouse
+       Raises:
+           ValueError: if mouse_label is neither MOUSE1 nor MOUSE2
+    '''
+    if mouse_label == MOUSE1:
+        return get_column_names_lbl(data, '_1_x', '_1_y')
+    if mouse_label == MOUSE2:
+        return get_column_names_lbl(data, '_2_x', '_2_y')
+    # an unknown label previously surfaced as an obscure UnboundLocalError
+    raise ValueError(f'Unknown mouse label {mouse_label!r}: '
+                     f'expected {MOUSE1!r} or {MOUSE2!r}')
+
+
+def get_column_names_lbl(data: pd.DataFrame, x_lbl: str, y_lbl: str) -> tuple[list, list]:
+    '''
+        Selects the columns of data whose name contains a given marker.
+
+        Column names are compared as strings; the original column order
+        is preserved in both returned lists.
+        
+        Args:
+            data: dataset with mice coordinates over time
+            x_lbl: marker identifying x-coordinate columns
+            y_lbl: marker identifying y-coordinate columns
+        
+        Returns:
+            list of column names containing x_lbl and
+            list of column names containing y_lbl
+    '''
+    x_columns = []
+    y_columns = []
+    for column in data.columns:
+        column_text = str(column)
+        if x_lbl in column_text:
+            x_columns.append(column)
+        if y_lbl in column_text:
+            y_columns.append(column)
+    return x_columns, y_columns
+
+
+
+
+def compute_speed(data : pd.DataFrame, mouse_label: str) -> pd.Series:
+    '''
+        This function computes the speed over time of a mouse.
+
+        The x and y columns of the mouse (see get_column_names) are averaged
+        frame by frame into a single two-dimensional point. The speed at
+        frame t is the length of the displacement of this point between
+        frame t - FRAMES_PER_SECOND and frame t; the first
+        FRAMES_PER_SECOND frames are therefore NaN.
+        
+        Args:
+           data: dataset with mice coordinates over time
+           mouse_label: name of one specific mouse
+
+        Returns:
+           speed of mouse over time
+    '''
+    x_columns, y_columns = get_column_names(data, mouse_label)
+
+    centroid_x = data[x_columns].mean(axis = 1)
+    centroid_y = data[y_columns].mean(axis = 1)
+
+    # diff(n) is the value at t minus the value at t - n
+    displacement_x = centroid_x.diff(FRAMES_PER_SECOND)
+    displacement_y = centroid_y.diff(FRAMES_PER_SECOND)
+
+    squared_displacement = displacement_x**2 + displacement_y**2
+    return np.sqrt(squared_displacement)
+
+
+
+
+def get_dominant(video: str) -> str:
+    '''
+        Looks up, in the labels file (LABELS_PATH), the value of the
+        'Dominant' column for the first row whose 'Video' equals video.
+        
+        Args:
+            video: name of a recording
+
+        Returns:
+            name of dominant mouse in the couple
+    '''
+    labels = pd.read_csv(LABELS_PATH)
+    dominant_names = labels.loc[labels.Video == video, 'Dominant']
+    return dominant_names.values[0]
+
+
+
+
+def get_subordinate(video: str) -> str:
+    '''
+        Looks up, in the labels file (LABELS_PATH), the value of the
+        'Subordinate' column for the first row whose 'Video' equals video.
+
+        Args:
+            video: name of a recording
+
+        Returns:
+            name of subordinate mouse in the couple
+
+    '''
+    labels = pd.read_csv(LABELS_PATH)
+    subordinate_names = labels.loc[labels.Video == video, 'Subordinate']
+    return subordinate_names.values[0]
+
+
+
+def get_video_list() -> list:
+    '''
+        Returns the content of the 'Video' column of the labels file
+        (LABELS_PATH), i.e. the recordings for which a
+        dominant/subordinate labeling is available.
+    '''
+    video_column = pd.read_csv(LABELS_PATH)['Video']
+    return video_column.values.tolist()
+
+
+
+
+def compute_window(video_list: list, target_behavior: str, left_length: int,
+                   right_length: int, curve_to_visualize: str,
+                   align_criterion: str=None) -> np.ndarray:
+    '''
+        Collects, over all recordings in video_list, the windows computed by
+        compute_window_video: each window has length
+        l = left_length + right_length + 1 and is cut from the time series
+        curve_to_visualize around one event of the target behavior.
+        
+        Args:
+            video_list: list of recordings from which distance or speed curves
+                are extracted
+            target_behavior: specifies the kind of behavioral event around which 
+                distance and speed windows are computed
+            left_length: number of frames of the window before its centre
+            right_length: number of frames of the window after its centre
+            curve_to_visualize: curves to be visualized in the plot (speed or distance) 
+            align_criterion: re-calculation of the 'zero' of the windows based on
+                a specific criterion
+
+        Returns:
+            numpy array with one row per window (empty when no window is found)
+    '''
+    curve_list = [curve
+                  for video in video_list
+                  for curve in compute_window_video(left_length, right_length,
+                                                    video, target_behavior,
+                                                    curve_to_visualize,
+                                                    align_criterion)]
+    return np.array(curve_list)
+
+
+
+
+def compute_window_video(left_length: int, right_length: int,
+                         video, target_behavior: str, curve_to_visualize: str,
+                         align_criterion: str=None) -> list:
+    '''
+        For one recording, cuts a window of length
+        l = left_length + right_length + 1 from the selected time series
+        around every event frame.
+
+        Event frames are obtained from get_flights (using the dominant and
+        subordinate mouse of the recording and its 'Fl'/'C' annotations) and
+        then re-aligned by indices_by_align_criterion, which is given the
+        speed of the subordinate mouse in cm.
+        
+        Args:
+            left_length: number of frames of the window before its centre
+            right_length: number of frames of the window after its centre
+            video: recording from which distance or speed curves
+                are extracted
+            target_behavior: specifies the kind of behavioral event around which 
+                distance and speed windows are computed
+            curve_to_visualize: curves to be visualized in the plot (speed or distance) 
+            align_criterion: re-calculation of the 'zero' of the windows based on
+                a specific criterion
+
+        Returns:
+            list of windows, each one a list of l values; positions falling
+            outside the recording are NaN
+    '''
+    
+    dominant = get_dominant(video)
+    subordinate = get_subordinate(video)
+    data = get_data(video)
+    behavioral_scoring_data = get_behavioral_scoring_data(video, ['Fl','C'])
+    speed_subordinate = compute_speed(data,subordinate) / PIXEL_TO_CM
+    
+
+    frames = get_flights(dominant, subordinate,
+                         behavioral_scoring_data, target_behavior) 
+    new_frames = indices_by_align_criterion(speed_subordinate,
+                                            frames, align_criterion)
+    time_series = get_curve_to_visualize(curve_to_visualize, data, video)
+
+    # Column 'shift<i>' holds, at row t, the value of the series at t + i, so
+    # row t of the table is the whole window centred on frame t. All columns
+    # are assembled with a single concatenation instead of one per offset.
+    shifted_columns = [time_series.shift(-offset).rename(f'shift{offset}')
+                       for offset in range(- left_length, right_length + 1, 1)]
+    dataset = pd.concat(shifted_columns, axis = 1)
+
+    return [list(dataset.loc[frame]) for frame in new_frames]
+
+
+
+
+def get_curve_to_visualize(curve_to_visualize: str,
+                           data: pd.DataFrame, video: str) -> pd.DataFrame:
+    '''
+        Return the time series to be decomposed in compute_window(...) and
+        represented through plot_window(...)
+        
+        Args: 
+            curve_to_visualize: name of the curve to use for window computation;
+                one of 'distance', 'speed subordinate mouse',
+                'speed dominant mouse'
+            data: set of mice coordinates to compute distance or speed over time
+            video: name of the recording from which the mice coordinates are
+                extracted.
+        Returns:
+            time series from which windows are computed, divided by PIXEL_TO_CM
+        Raises:
+            ValueError: if curve_to_visualize is not one of the supported names
+    '''
+    if curve_to_visualize == 'distance':
+        curve = compute_distance(data) / PIXEL_TO_CM
+    elif curve_to_visualize == 'speed subordinate mouse':
+        curve = compute_speed(data, get_subordinate(video)) / PIXEL_TO_CM
+    elif curve_to_visualize == 'speed dominant mouse':
+        curve = compute_speed(data, get_dominant(video)) / PIXEL_TO_CM
+    else:
+        # Previously an unknown name fell through every branch and failed
+        # with an UnboundLocalError on the return statement.
+        raise ValueError(
+            f"Unknown curve_to_visualize {curve_to_visualize!r}; expected "
+            "'distance', 'speed subordinate mouse' or 'speed dominant mouse'")
+    return curve
+
+
+
+
+def adapt_frame_rate(old_dataset: pd.Series, behavior_list: list) -> pd.DataFrame:
+    '''
+        Oversample the behavioral annotation (old_dataset) so that every input
+        sample becomes FRAME_RATE_ADAPT output samples. According to the
+        original author this takes the scoring from 5 frames per second to
+        45 frames per second. An annotation listed in behavior_list is written
+        only on the first of its output samples, so no event is duplicated;
+        every other output sample is None.
+
+        Args:
+            old_dataset: raw behavioral annotation, read by position 0..n-1
+            behavior_list: list of all annotations that must be preserved
+        Returns:
+            DataFrame with a single column 'Default' holding the oversampled
+            annotation
+
+        Example (with FRAME_RATE_ADAPT = 9):
+        old_dataset = [None, 'Flight', None]
+        new = [None, None, None, None, None, None, None, None, None,
+               'Flight', None, None, None, None, None, None, None, None,
+               None, None, None, None, None, None, None, None, None]
+    '''
+    n_samples = old_dataset.shape[0]
+    oversampled = []
+    for position in range(n_samples):
+        label = old_dataset[position]
+        if label in behavior_list:
+            # keep the event once, then pad the rest of its slot
+            oversampled += [label] + [None] * (FRAME_RATE_ADAPT - 1)
+        else:
+            oversampled += [None] * FRAME_RATE_ADAPT
+    return pd.DataFrame({'Default': oversampled})
+
+
+
+
+def get_behavioral_scoring_data(video: str, behavior_list: list):
+    '''
+        Given a couple (video), load its behavioral scoring file, cut it to
+        the analysed portion of the recording and return it oversampled by
+        adapt_frame_rate(...).
+
+        The scoring is read from 'Behavioral scoring/<video>'. Rows are kept
+        from the recording's 'Start_frame' (looked up in the labels file) up
+        to whichever comes first: the row just before the 'Frequencies'
+        marker in the 'Time' column, or Start_frame + MAX_LENGTH.
+
+        Args:
+            video: file name of the recording's scoring table
+            behavior_list: behavior names to keep, expanded with
+                get_scoring_list(...)
+        Returns:
+            oversampled behavioral dataset produced by adapt_frame_rate(...)
+    '''
+    labels_to_keep = get_scoring_list(behavior_list)
+    raw_scoring = pd.read_csv('Behavioral scoring/' + video)
+
+    # row just before the first row whose Time is 'Frequencies'
+    marker_rows = raw_scoring[raw_scoring.Time == 'Frequencies']
+    stop_row = marker_rows.index[0] - 1
+
+    label_table = pd.read_csv(LABELS_PATH)
+    video_rows = label_table[label_table.Video == video]
+    start_row = video_rows['Start_frame'].values[0]
+
+    # never keep more than MAX_LENGTH rows after the start
+    stop_row = min(stop_row, start_row + MAX_LENGTH)
+
+    trimmed = raw_scoring[start_row:stop_row].reset_index()
+    return adapt_frame_rate(trimmed.Default, labels_to_keep)
+
+
+
+def index_at_behavior(beh: str, mouse: str, dataset: pd.Series) -> list:
+    '''
+        Collect the indices at which a given mouse performs a given behavior,
+        in either compartment.
+
+        The annotation searched for is '<beh>-<mouse>-C1' and then
+        '<beh>-<mouse>-C2'; the result lists all C1 matches first, followed
+        by all C2 matches.
+        
+        Args: 
+            beh: name of behavior
+            mouse: name of mouse
+            dataset: series of behavioral annotations for a single recording
+        Returns:
+            list of index labels where the behavior occurs for the mouse
+    '''
+    stem = beh + '-' + mouse
+    return [
+        idx
+        for compartment in ('-C1', '-C2')
+        for idx in dataset[dataset == stem + compartment].index.values.tolist()
+    ]
+
+
+
+
+def get_scoring_list(behaviors: list) -> list:
+    '''
+        Expand behavior names into every complete annotation label.
+        
+        Args: 
+            behaviors: list of behavior names
+        Returns:
+            all labels '<behavior>-<mouse>-<compartment>' for mice M1, M2 and
+            compartments C1, C2; the mouse varies slowest and the behavior
+            fastest
+
+        Example:
+        behaviors = ['Fl','C']
+        behavior_list = ['Fl-M1-C1','C-M1-C1','Fl-M1-C2','C-M1-C2',
+                         'Fl-M2-C1','C-M2-C1','Fl-M2-C2','C-M2-C2']
+    '''
+    return [
+        name + mouse_tag + compartment_tag
+        for mouse_tag in ('-M1', '-M2')
+        for compartment_tag in ('-C1', '-C2')
+        for name in behaviors
+    ]
+
+
+
+
+def get_flights(dominant: str, subordinate: str, dataset: pd.Series, behavior: str) -> list:
+    '''
+        Returns the event indices for one of:
+        - 'all_flights': all flights of the subordinate mouse
+        - 'flights_with_chases': flights of the subordinate mouse for which
+          the dominant mouse is annotated as chasing in at least one of the
+          2 * FRAMES_PER_SECOND frames preceding the flight
+        - 'flights_without_chases': all remaining flights of the subordinate
+          mouse
+          
+    Args: 
+        dominant: name of dominant mouse
+        subordinate: name of subordinate mouse
+        dataset: behavioral annotations; must expose the labels as a column
+            named 'Default'
+        behavior: type of behavioral events, one of the three names above
+    Returns:
+        list of frames when the specified type of behavior occurs
+    Raises:
+        ValueError: if behavior is not one of the three supported names
+            (previously None was returned silently)
+    '''
+    supported = ('flights_with_chases', 'all_flights', 'flights_without_chases')
+    if behavior not in supported:
+        raise ValueError(
+            f"Unknown behavior {behavior!r}; expected one of {supported}")
+
+    time_span = FRAMES_PER_SECOND*2
+
+    # Column 'shift{i}' (i <= 0) holds, at row t, the annotation at row t + i,
+    # i.e. what happened |i| frames before t. All lag columns are built first
+    # and concatenated once instead of growing the table inside the loop.
+    lagged = [dataset.Default.shift(-i).rename('shift' + str(i))
+              for i in range(-time_span, 1, 1)]
+    dataset = pd.concat([dataset] + lagged, axis = 1)
+
+    is_flight = ((dataset.Default == 'Fl-' + subordinate + '-C1')
+                 | (dataset.Default == 'Fl-' + subordinate + '-C2'))
+    filtered = dataset[is_flight].drop(columns = ['Default'])
+
+    chased_in_c2 = filtered[(filtered == 'C-' + dominant + '-C2').any(axis = 1)].index.tolist()
+    chased_in_c1 = filtered[(filtered == 'C-' + dominant + '-C1').any(axis = 1)].index.tolist()
+    # A flight preceded by chases annotated in both compartments must be
+    # reported once, not twice; the original ordering (C2 matches, then C1
+    # matches) is kept.
+    already_listed = set(chased_in_c2)
+    flights_with_chases = chased_in_c2 + [f for f in chased_in_c1
+                                          if f not in already_listed]
+
+    if behavior == 'flights_with_chases':
+        return flights_with_chases
+
+    all_flights = index_at_behavior('Fl', subordinate, dataset.Default)
+    if behavior == 'all_flights':
+        return all_flights
+
+    # behavior == 'flights_without_chases'
+    chased = set(flights_with_chases)
+    return [f for f in all_flights if f not in chased]
+
+
+
+
+
+
+
+def indices_by_align_criterion(time_series: pd.Series,
+                               frames: list, align_criterion: str) -> list:
+    '''
+        This function takes as input a time series and a set of indices,
+        selects a window of the time series around each index
+        (from frame - LEFT to frame + RIGHT) and
+        computes a new index based on one of the following criteria:
+            - None (or any falsy value): the input frames are returned unchanged
+            - peak: the new index is the point of maximum of the time series
+            inside the window
+            - max_slope: the new index is the point of maximum slope of the
+            time series inside the window
+        
+        Args: 
+            time_series: time series based on which the new frames are computed
+            frames: list of old frames
+            align_criterion: criterion for calculation of the new set of frames
+        Returns:
+            list of new frames
+        Raises:
+            ValueError: if align_criterion is set but is neither 'peak' nor
+                'max_slope' (previously this failed with an UnboundLocalError
+                or silently reused the frame computed for the previous event)
+    '''
+    if not align_criterion:
+        return frames
+    if align_criterion not in ('peak', 'max_slope'):
+        raise ValueError(
+            f"Unknown align_criterion {align_criterion!r}; "
+            "expected None, 'peak' or 'max_slope'")
+
+    new_frames = []
+    for frame in frames:
+        curve = time_series.loc[frame - LEFT:frame + RIGHT]
+        # argmax() is a position inside the window; adding the window start
+        # turns it back into a frame. The max(..., 0) assumes the series has
+        # a 0-based integer index, so a window reaching before the first
+        # sample starts at 0 -- TODO confirm against compute_speed(...).
+        window_start = max(frame - LEFT, 0)
+        if align_criterion == 'peak':
+            offset = curve.argmax()
+        else:
+            offset = np.gradient(curve).argmax()
+        new_frames.append(offset + window_start)
+    return new_frames
+
+