diff --git a/Makefile b/Makefile index c21a27a..027d500 100644 --- a/Makefile +++ b/Makefile @@ -1,12 +1,13 @@ DEBUG ?= 0 INSTALL_PREFIX ?= /usr/local +LINK_MPI_CXX ?= MAKE = make BUILD = build SRC = src -QMAKEFLAGS = PREFIX=$(INSTALL_PREFIX) +QMAKEFLAGS = PREFIX=$(INSTALL_PREFIX) LINK_MPI_CXX=$(LINK_MPI_CXX) ifeq ($(DEBUG), 1) QMAKEFLAGS += CONFIG+=debug diff --git a/bin/kinc-3d-viewer.py b/bin/kinc-3d-viewer.py index 452e794..28ff318 100755 --- a/bin/kinc-3d-viewer.py +++ b/bin/kinc-3d-viewer.py @@ -2,40 +2,7 @@ """ Creates a Dash application that provides 3D visualization of a KINC network. -This script accepts the following arguments: - - --net : (required) The path to the KINC-derived network file - - --emx : (retuired) The path to the log2 transformed Gene Expression Matrix - or Metabolite abundance matrix. - - --amx : (required) The path to the tab-delimited annotation matrix. - The matrix must have at least one column that contains a unique - list of sample names. - - --nmeta : (optional) The path to a tab-delimited node meta data file. The - format of the file must have 4 columns, with the first - containing the node name, the second a controlled vocabulary - term ID, the second the term definition and the fourth the vocubulary name. - - --sample_col: (optional) The name of the column in the annotation matrix - that contains the unique sample names. Defaults to "Sample" - - --debug: (optional). Add this argument to enable Dash application - debugging mode. - - --iterations: (optional). The number of iterations to perform when - calculating the Force Atlas2 layout. This argument is only - used the first time a network is viewed or if the - --redo_layout argument is provided. - - --redo-layout : (optional). If the 2D and 3D network layout has already - been constructed it will be loaded from a file. Add this - arugment to force the layouts to be rebuilt and not loaded - from the files. To prevent Dash from rerunning the layout - on callbacks, this option results in the program - terminating. To view the application, restart without - this option. +For usage instructions run this script with the --help flag. """ @@ -56,7 +23,9 @@ import re import ast import time +import base64 from progress.bar import IncrementalBar +import socket @@ -131,7 +100,7 @@ def load_node_meta(file_path): Imports the tab-delimited node metadata file. The format of the file must have 4 columns, with the first containing the - node name, the second a controlled vocabulary term ID, the second the + node name, the second a controlled vocabulary term ID, the third the term definition and the fourth the vocubulary name. """ nmeta = pd.read_csv(file_path, sep="\t") @@ -198,6 +167,7 @@ def calculate_2d_layout(net, net_prefix, redo_layout, iterations): """ g = get_iGraph(net) + g.simplify() t = pd.Series(g.transitivity_local_undirected(), index=g.vs['name']) d = pd.DataFrame(g.degree(), index=g.vs['name'], columns=['Degree']) @@ -253,6 +223,8 @@ def bin_edges(net): """ net['Edge_Bin'] = np.around(np.abs(net['Similarity_Score']), decimals=2) net['Pval_Bin'] = np.round(-np.log10(net['p_value'])) + if 'hotelling_p_value' in net.columns: + net['HPval_Bin'] = np.round(-np.log10(net['hotelling_p_value'])) if (net['r_squared'].dtype == 'object'): net['Rsqr_Bin'] = 0 else: @@ -291,10 +263,13 @@ def find_vlayers(row, vtype='Source', bar=None): node = glayout.loc[row[vtype]] ebin = row['Edge_Bin'] pbin = row['Pval_Bin'] + hpbin = np.nan + if ('HPval_Bin' in row.index): + hpbin = row['HPval_Bin'] rbin = row['Rsqr_Bin'] rel = row['Relationship'] test = row['Test_Name'] - return(row[vtype], node['X'], node['Y'], ebin, pbin, rbin, rel, test, node['Degree'], node['CC']) + return(row[vtype], node['X'], node['Y'], ebin, pbin, hpbin, rbin, rel, test, node['Degree'], node['CC']) if (redo_layout | (not os.path.exists(net_prefix + '.3Dvlayers.txt'))): @@ -304,7 +279,7 @@ def find_vlayers(row, vtype='Source', bar=None): ltarget = net.apply(find_vlayers, vtype='Target', bar=bar, axis=1) print("") - columns = ['Vertex', 'X', 'Y', 'EBin', 'PBin', 'RBin', 'Rel', 'Test_Name', 'Degree', 'CC'] + columns = ['Vertex', 'X', 'Y', 'EBin', 'PBin', 'HPBin', 'RBin', 'Rel', 'Test_Name', 'Degree', 'CC'] vlayers = pd.DataFrame.from_records(lsource.append(ltarget).values, columns=columns) vlayers = vlayers[vlayers.duplicated() == False] # We want to place the node in the layer where it first appears. @@ -350,6 +325,9 @@ def place_elayers(row, bar = None): bar.next() ebin = row['Edge_Bin'] pbin = row['Pval_Bin'] + hpbin = np.nan + if ('HPval_Bin' in row.index): + hpbin = row['HPval_Bin'] rbin = row['Rsqr_Bin'] rel = row['Relationship'] test = row['Test_Name'] @@ -360,7 +338,7 @@ def place_elayers(row, bar = None): row["Source"], row["Target"], row["Samples"], - ebin, pbin, rbin, rel, test]) + ebin, pbin, hpbin, rbin, rel, test]) if (redo_layout | (not os.path.exists(net_prefix + '.3Delayers.txt'))): print("Calculating 3D edge layout.") @@ -368,7 +346,7 @@ def place_elayers(row, bar = None): ledge = net.apply(place_elayers, bar=bar, axis=1) print("") - elayers = pd.DataFrame.from_records(ledge, columns=['X', 'Y', 'Source', 'Target', 'Samples', 'EBin', 'PBin', 'RBin', 'Rel', 'Test_Name']) + elayers = pd.DataFrame.from_records(ledge, columns=['X', 'Y', 'Source', 'Target', 'Samples', 'EBin', 'PBin', 'HPBin', 'RBin', 'Rel', 'Test_Name']) elayers['name'] = elayers['Source'] + " (co) " + elayers['Target'] elayers.to_csv(net_prefix + '.3Delayers.txt') else: @@ -409,6 +387,8 @@ def create_network_plot(net, vlayers, elayers, color_by = 'Score', layer_by = 'S Z = vlayers['EBin'] if layer_by == 'P-value': Z = vlayers['PBin'] + if layer_by == 'Hotelling P-value (phased)': + Z = vlayers['HPBin'] if layer_by == 'R^2': Z = vlayers['RBin'] if layer_by == 'Test Name': @@ -422,6 +402,7 @@ def create_network_plot(net, vlayers, elayers, color_by = 'Score', layer_by = 'S marker=dict(symbol='circle', size=np.log10(vlayers['Degree'])*4, line=dict(width=1, color="#888888")), text="Node: " + vlayers['Vertex'], + customdata=vlayers['Vertex'], hoverinfo='text', name='Nodes')]) # Add the edges and bin them @@ -430,6 +411,8 @@ def create_network_plot(net, vlayers, elayers, color_by = 'Score', layer_by = 'S slider_title = 'Similarity Score' if color_by == 'P-value': slider_title = '-log10(p)' + if color_by == 'Hotelling P-value (phased)': + slider_title = '-log10(p)' if color_by == 'R^2': slider_title = 'R-squared' if color_by == 'Test Name': @@ -442,16 +425,18 @@ def create_network_plot(net, vlayers, elayers, color_by = 'Score', layer_by = 'S layer_title = layer_by if layer_by == 'P-value': layer_title = '-log10(p)' + if layer_by == 'Hotelling P-value (phased)': + layer_title = '-log10(p)' (colorway, sliders, nticks) = create_binned_network_figure(fig1, elayers, color_by, layer_by, slider_title, include_slider) fig1.update_layout( - height=600, - title=dict(text = "3D Network View", font = dict(color='#FFFFFF')), + autosize=True, + #title=dict(text = "3D Network View", font = dict(color='#FFFFFF')), showlegend=True, legend=dict(font = dict(color="#FFFFFF")), - margin=dict(l=10, r=10, t=30, b=10), + margin=dict(l=450, r=10, t=10, b=10), paper_bgcolor="#000000", colorway=colorway, scene=dict( @@ -468,7 +453,6 @@ def create_network_plot(net, vlayers, elayers, color_by = 'Score', layer_by = 'S x=0, y=0.1, xanchor='left', yanchor='bottom', font=dict(size=14)) ], sliders=sliders, - autosize=True, ) # We want an orthographic layout so that when looking above the edges line up @@ -501,6 +485,8 @@ def create_binned_network_figure(figure, elayers, color_by = 'Score', color_col = 'EBin' if color_by == 'P-value': color_col = 'PBin' + if color_by == 'Hotelling P-value (phased)': + color_col = 'HPBin' if color_by == 'R^2': color_col = 'RBin' if color_by == 'Test Name': @@ -508,11 +494,13 @@ def create_binned_network_figure(figure, elayers, color_by = 'Score', if color_by == 'Relationship': color_col = 'Rel' - layer_col = 'Edge_Bin' + layer_col = 'EBin' if layer_by == 'Score': layer_col = 'EBin' if layer_by == 'P-value': layer_col = 'PBin' + if layer_by == 'Hotelling P-value (phased)': + layer_col = 'HPBin' if layer_by == 'R^2': layer_col = 'RBin' if layer_by == 'Test Name': @@ -578,7 +566,7 @@ def create_binned_network_figure(figure, elayers, color_by = 'Score', sliders = [dict( active=0, currentvalue={"prefix": slider_title + ": "}, - pad={"t": 50}, + pad={"b": 50}, steps=steps, font=dict(color = '#FFFFFF'), tickcolor='#FFFFFF', @@ -595,6 +583,60 @@ def create_binned_network_figure(figure, elayers, color_by = 'Score', + +def create_degree_distribution_plot(vlayers): + """ + Creates a 2D scatterplot containing the degree distribution + """ + vdata = vlayers.loc[:,('Vertex', 'Degree')].drop_duplicates() + vdata = vdata.groupby('Degree').agg(['count']).reset_index() + + fig = go.Figure(data=[go.Scatter( + x=vdata['Degree'], + y=vdata['Vertex']['count'], + mode='markers', + marker=dict(symbol='circle', size=5, color='#000088'))]) + fig.update_layout( + height=350, + title="Node Degree Distribution", + margin=dict(l=10, r=10, t=80, b=20), + xaxis_type="log", + yaxis_type="log", + xaxis_title="Degree", + yaxis_title="Number of Nodes", + ) + return fig + + + + + +def create_avg_cc_distribution_plot(vlayers): + """ + Creates a 2D scatterplot containing the average clustering coefficient distribution + """ + vdata = vlayers.loc[:,('CC', 'Degree')].drop_duplicates() + vdata = vdata.groupby('Degree').agg(['mean']).reset_index() + + fig = go.Figure(data=[go.Scatter( + x=vdata['Degree'], + y=vdata['CC']['mean'], + mode='markers', + marker=dict(symbol='circle', size=5, color='#000088'))]) + fig.update_layout( + height=350, + title="Avg. Clusering Coefficient Distribution", + margin=dict(l=10, r=10, t=80, b=10), + xaxis_type="log", + yaxis_type="log", + xaxis_title="Degree", + yaxis_title="Number of Nodes", + ) + return fig + + + + def create_expression_scatterplot(gem, amx, elayers, color_col=None, edge_index = None): """ Uses Plotly to create the interactive 3D scatterplot of co-expression @@ -632,8 +674,8 @@ def create_expression_scatterplot(gem, amx, elayers, color_col=None, edge_index # Calculate the sizes of the points. sizes = pd.Series(list(samples)) - sizes = sizes.replace(to_replace=r'[^1]', value='8', regex=True) - sizes = sizes.replace({'1': '16'}) + sizes = sizes.replace(to_replace=r'[^1]', value='5', regex=True) + sizes = sizes.replace({'1': '10'}) sizes = sizes.astype('int') sizes.index = sdata.index @@ -701,11 +743,11 @@ def create_expression_scatterplot(gem, amx, elayers, color_col=None, edge_index text= sdata.index, hoverinfo='text')]) fig2.update_layout( - height=600, - title="3D Edge Co-Expression Scatterplot", + height=400, + title="", showlegend=showlegend, legend={'itemsizing': 'constant'}, - margin=dict(l=10, r=10, t=30, b=10), + margin=dict(l=10, r=10, t=0, b=10), scene=dict( aspectmode="cube", xaxis=dict(showbackground=True, showline=True, zeroline=True, showgrid=True, @@ -735,6 +777,60 @@ def create_expression_scatterplot(gem, amx, elayers, color_col=None, edge_index + +def create_network_stats_table(net): + """ + Construts the HTML table that holds information about the network. + + net : the network data frame. + """ + htr_style = {} + htd_style = { + 'text-align' : 'left', 'padding' : '5px', + 'margin': '0px', 'padding' : '0 0 0 20', + 'width' : '60%', "border-bottom": "1px solid #BBBBBB"} + td_style = { + 'text-align' : 'left', 'padding' : '5px', + 'margin': '0px', 'padding' : '0 0 0 20', "border-bottom": "1px solid #BBBBBB" + } + + div_children = [] + table_rows = [] + + num_edges = net.shape[0] + unique_edges = net.loc[:,('Source', 'Target')].drop_duplicates().shape[0] + num_nodes = len(pd.concat([net['Source'], net['Target']]).unique()) + + div_children.append( + html.Table( + style = { + "background-color" : 'white', 'color' : 'black', + 'margin-top' : '0px', 'width' : '100%', + 'margin-bottom' : '0px' + }, + children=[ + html.Tr([ + html.Th('Total Edges', style=htd_style), + html.Td(num_edges, style=td_style) + ]), + html.Tr([ + html.Th('Unique Edges', style=htd_style), + html.Td(unique_edges, style=td_style) + ]), + html.Tr([ + html.Th('Number of Nodes', style=htd_style), + html.Td(num_nodes, style=td_style) + ]) + ] + ) + ) + + return html.Div( + id='network-stats-table', + children = div_children, + ) + + def create_dash_edge_table(net, edge_index = None): """ Constructs the HTML table that holds edge information for the Dash appself. @@ -747,30 +843,68 @@ def create_dash_edge_table(net, edge_index = None): returns : a Dash html.Table object. """ - htr_style = {'background-color' : '#f2f2f2'} - hth_style = {'text-align' : 'left', - 'padding' : '5px', - 'border-bottom' : '1px solid #ddd'}; - th_style = {'text-align' : 'left', - 'padding' : '5px', - 'border-bottom' : '1px solid #ddd'}; + htr_style = {} + htd_style = { + 'text-align' : 'left', 'padding' : '5px', + 'margin': '0px', 'padding' : '0 0 0 20', + 'width' : '30%', "border-bottom": "1px solid #BBBBBB"} + td_style = { + 'text-align' : 'left', 'padding' : '5px', + 'margin': '0px', 'padding' : '0 0 0 20', "border-bottom": "1px solid #BBBBBB" + } net_fixed = net.drop(['Samples', 'Edge_Bin', 'Pval_Bin', 'Rsqr_Bin', 'Relationship'], axis=1) - if ('p_value' in net_fixed.columns): - net_fixed['p_value'] = net_fixed['p_value'].apply(np.format_float_scientific, precision=4) + if ('HPval_Bin' in net_fixed.columns): + net_fixed = net_fixed.drop(['HPval_Bin'], axis=1) + for colname in net_fixed.columns: + if ('p_value' in colname): + net_fixed[colname] = net_fixed[colname].apply(np.format_float_scientific, precision=4) columns = net_fixed.columns - table_rows = [] - table_rows.append(html.Tr([html.Th(col, style=hth_style) for col in columns], style=htr_style)) + div_children = [] if not edge_index == None: row_vals = net_fixed.iloc[edge_index] source = row_vals['Source'] target = row_vals['Target'] + div_children.append(html.Label( + '{source} (co) {target}'.format(source = source, target=target), + style = {'padding' : '0px', 'margin' : '0px'} + )) + div_children.append(html.Br()) row_vals = net_fixed[(net_fixed['Source'] == source) & (net_fixed['Target'] == target)] for index, row in row_vals.iterrows(): - table_rows.append(html.Tr([html.Th(row[col], style=th_style) for col in columns])) + table_rows = [] + for col in columns: + if col == "Source" or col == "Target": + continue + + table_rows.append( + html.Tr([ + html.Th(col, style=htd_style), + html.Td(row[col], style=td_style) + ]) + ) + div_children.append( + html.Label('Edge #{index}'.format(index = index))) + div_children.append( + html.Table( + style = { + "background-color" : 'white', 'color' : 'black', + 'margin-top' : '10px', 'margin-bottom' : '0px', + 'width' : '100%', + }, + children=table_rows + ) + ) + else: + div_children.append( + html.Div('To view edge details, click an edge in the network.') + ) - return html.Table(children=table_rows) + return html.Div( + id='edge-table', + children = div_children, + ) @@ -787,28 +921,58 @@ def create_dash_sample_table(net, amx, sample = None): returns : a Dash html.Table object. """ - htr_style = {'background-color' : '#f2f2f2'} - hth_style = {'text-align' : 'left', - 'padding' : '5px', - 'border-bottom' : '1px solid #ddd'}; - th_style = {'text-align' : 'left', - 'padding' : '5px', - 'border-bottom' : '1px solid #ddd'}; + htr_style = {} + htd_style = { + 'text-align' : 'left', 'padding' : '5px', + 'margin': '0px', 'padding' : '0 0 0 20', + 'width' : '30%', "border-bottom": "1px solid #BBBBBB"} + td_style = { + 'text-align' : 'left', 'padding' : '5px', + 'margin': '0px', 'padding' : '0 0 0 20', "border-bottom": "1px solid #BBBBBB" + } columns = amx.columns - table_rows = [] - table_rows.append(html.Tr([html.Th(col, style=hth_style) for col in columns], style=htr_style)) + div_children = [] if sample: - row_vals = amx.loc[sample] - table_rows.append(html.Tr([html.Th(row_vals[col], style=th_style) for col in columns])) + div_children.append(html.H4( + children = ['Sample: {sample}'.format(sample = sample)], + style = {'padding' : '0px', 'margin' : '0px'} + )) + table_rows = [] + row = amx.loc[sample] + for col in columns: + table_rows.append( + html.Tr([ + html.Th(col, style=htd_style), + html.Td(row[col], style=td_style) + ]) + ) + + div_children.append( + html.Table( + style = { + "background-color" : 'white', 'color' : 'black', + 'margin-top' : '10px', + 'margin-bottom' : '10px', 'width' : '100%', + }, + children=table_rows + ) + ) + else: + div_children.append( + html.Div('To view sample details, click an edge in the network, then in the edge scatterplot click a sample.') + ) - return html.Table(children=table_rows) + return html.Div( + id='sample-table', + children = div_children + ) -def create_dash_node_table(net, nmeta, nodes = None): +def create_dash_node_table(net, nmeta, vlayers, node = None): """ Constructs the HTML table that holds node information for the Dash app. @@ -816,31 +980,87 @@ def create_dash_node_table(net, nmeta, nodes = None): nmeta : The dataframe containing the node metadata. + vlayers : The dataframe containing the 3D coordinates for the nodes. + node : The name of the node to display returns : a Dash html.Table object. """ - htr_style = {'background-color' : '#f2f2f2'} - hth_style = {'text-align' : 'left', - 'padding' : '5px', - 'border-bottom' : '1px solid #ddd'}; - th_style = {'text-align' : 'left', - 'padding' : '5px', - 'border-bottom' : '1px solid #ddd'}; - - - if nmeta: - columns = nmeta.columns - table_rows = [] - table_rows.append(html.Tr([html.Th(col, style=hth_style) for col in columns], style=htr_style)) - if (not nmeta is None) & (not nodes is None): - row_vals = nmeta.loc[nodes] - for index, row in row_vals.iterrows(): - table_rows.append(html.Tr([html.Th(row[col], style=th_style) for col in columns])) + htr_style = {} + htd_style = { + 'text-align' : 'left', 'padding' : '5px', + 'margin': '0px', 'padding' : '0 0 0 20', + 'width' : '30%', "border-bottom": "1px solid #BBBBBB"} + td_style = { + 'text-align' : 'left', 'padding' : '5px', + 'margin': '0px', 'padding' : '0 0 0 20', "border-bottom": "1px solid #BBBBBB" + } + + div_children = [] + table_rows = [] + if not node is None: + table_rows.append( + html.Tr([ + html.Th('Name', style=htd_style), + html.Td(node, style=td_style) + ]) + ) + table_rows.append( + html.Tr([ + html.Th('Degree', style=htd_style), + html.Td(vlayers.loc[vlayers['Vertex'] == node, 'Degree'].unique(), style=td_style) + ]) + ) + if not nmeta is None: + columns = nmeta.columns + if not nmeta is None: + rows = nmeta.loc[node] + for index, row in rows.iterrows(): + table_rows.append( + html.Tr([ + html.Th( + colSpan = 2, + children=[ + html.Label( + "{term}".format(term = row['Term']), + style= {'font-weight' : 'bold'} + ), + html.Div( + row['Definition'], + style= {'font-weight' : 'normal'}) + ], + style=htd_style, + ) + ]) + ) + else: + div_children.append( + html.Div('There is no additional information about this node.') + ) + else: + div_children.append( + html.Div('There are no node meta data provided. Use the --nmeta option to load node data when running this application.') + ) + div_children.append( + html.Table( + style = { + "background-color" : 'white', 'color' : 'black', + 'margin-top' : '0px', + 'margin-bottom' : '0px', 'width' : '100%', + }, + children=table_rows + ) + ) + else: + div_children.append( + html.Div('To view node details, click a node in the network.') + ) - return html.Table(children=table_rows) - return '' + return html.Div( + id='node-table', + children = div_children + ) @@ -879,10 +1099,12 @@ def create_condition_select(amx, sample_col = 'Cluster'): # Build the select element. select = dcc.Dropdown( id = 'coexp-condition-select', + style = {'color' : 'black'}, options = [ {'label' : col, 'value' : col} for col in keep ], - value = 'Cluster') + value = 'Cluster' + ) return select @@ -904,6 +1126,8 @@ def create_edge_color_select(net): options = ['Score'] if 'p_value' in net.columns: options.append('P-value') + if 'hotelling_p_value' in net.columns: + options.append('Hotelling P-value (phased)') if 'Test_Name' in net.columns: options.append('Test Name') if 'r_squared' in net.columns: @@ -912,6 +1136,9 @@ def create_edge_color_select(net): select = dcc.Dropdown( id = 'edge-color-select', + style = { + 'color' : 'black' + }, options = [ {'label' : col, 'value' : col} for col in options ], @@ -939,6 +1166,8 @@ def create_edge_layer_select(net): options = ['Score'] if 'p_value' in net.columns: options.append('P-value') + if 'hotelling_p_value' in net.columns: + options.append('Hotelling P-value (phased)') if 'Test_Name' in net.columns: options.append('Test Name') if 'r_squared' in net.columns: @@ -947,6 +1176,9 @@ def create_edge_layer_select(net): select = dcc.Dropdown( id = 'edge-layer-select', + style = { + 'color' : 'black' + }, options = [ {'label' : col, 'value' : col} for col in options ], @@ -956,6 +1188,58 @@ def create_edge_layer_select(net): + + +def build_sidebar_box_header(title, id_prefix): + + return html.Div( + style = { + 'background-color' : '#555555', 'color' : 'white', + 'margin': '0px', 'padding':'10px', + "border-radius": "5px"}, + children = [ + html.H3( + children = [title], + style = { + 'float' : 'left', + 'padding' : '0px', 'margin' : '0px 0px 0px 0' + } + ), + html.Button( + 'toggle', + id="{prefix}-toggle".format(prefix=id_prefix), + n_clicks=0, + # src="https://img.icons8.com/officexs/32/000000/open-view.png", + style={ + "height" : "20px", "float" : "right", + 'padding' : '0px', 'margin' : '0px 0px 0px 0' + } + ), + html.Div(style ={'clear' : 'both'}) + ] + ) + + + + + +def write_to_data_uri(s): + """ + Writes to a uri. + Use this function to embed javascript into the dash app. + Adapted from the suggestion by user 'mccalluc' found here: + https://community.plotly.com/t/problem-of-linking-local-javascript-file/6955/2 + """ + uri = ( + ('data:;base64,').encode('utf8') + + base64.urlsafe_b64encode(s.encode('utf8')) + ).decode("utf-8", "strict") + return uri + + + + + def build_application(net, gem, amx, nmeta, vlayers, elayers, sample_col, net_name): @@ -985,113 +1269,256 @@ def build_application(net, gem, amx, nmeta, vlayers, elayers, sample_col, return : The Dash application object. """ - app = dash.Dash() - app.layout = html.Div([ - # Header Row - html.Div(className='row', id = "header", children=[ - html.Img( - src="https://raw.githubusercontent.com/SystemsGenetics/KINC/master/docs/images/kinc.png", - style={"height" : "55px","display" : "inline-block", - "padding" : "0px", "margin" : "0px 10px 0px 10px"}), - html.H1(children="3D Network Explorer", + + sidebar_box_style = { + "float" : "left", "width" : "100%", "color" : "black", + "padding" : "0px", "margin-bottom" : "10px", + "background-color" : "#CCCCCC", + "border-radius": "5px" + } + + internal_js = write_to_data_uri(""" + """) + + external_scripts = [ + 'https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js', + internal_js, + ] + external_stylesheets = [ + ] + app = dash.Dash(__name__, + external_scripts=external_scripts, + external_stylesheets=external_stylesheets + ) + app.scripts.config.serve_locally = False + app.layout = html.Div( + style = { + "padding" : "0px", "background-color" : + "black", "margin" : "0px", "color" : "white", + "width" : "100%", "height" : "100vh" + }, + children = [ + # Graph Row + html.Div( + style = { + "border" : "0px solid white", "padding" : "15px", + "background-color" : "black", "margin" : "0px", + }, + children=[ + dcc.Graph( + id = 'network-3dview', + style = { + "height" : "100vh" + }, + figure = create_network_plot(net, vlayers, elayers), + config = { + 'toImageButtonOptions' : { + 'filename': 'kinc_3d_network_view', + 'width': 800, + 'height': 600, + 'format': 'svg', + 'scale' : 2 + } + } + ), + dcc.Input( + id='current-network-3dview-camera', + type="number", + value=0, + style= {'display' : 'none'} + ), + dcc.Input( + id='current-network-3dview-aspect', + type="number", + value=0, + style= {'display' : 'none'} + ) + ] + ), + # Header Row + html.Div( + id = "header", style={ - "display" : "inline-block", "padding" : "10px 0px 0px 0px", - "margin" : "0px", "vertical-align" : "top"}), - html.Div(children="Network name: " + net_name, - style={"padding" : "0px 0px 0px 10px"}), - ]), - # Graph Row - html.Div(children=[ - # 3D network plot - html.Div(children=[ - html.Div(id='edge-select-box', children=[ - html.Label('Color', style={'color': '#FFFFFF'}), - create_edge_color_select(net), - html.Label('Layer', style={'color': '#FFFFFF'}), - create_edge_layer_select(net)], - style={'padding-bottom' : '10px'}), - dcc.Graph(id = 'network-3dview', - figure = create_network_plot(net, vlayers, elayers), - config = { - 'toImageButtonOptions' : { - 'filename': 'kinc_3d_network_view', - 'width': 800, - 'height': 600, - 'format': 'svg', - 'scale' : 2 - } - }), - dcc.Input( - id='current-network-3dview-camera', type="number", - value=0, style= {'display' : 'none'}), - dcc.Input( - id='current-network-3dview-aspect', type="number", - value=0, style= {'display' : 'none'})], - style = {"width" : "55%", "display" : "inline-block", - "border" : "1px solid black", "padding" : "10px", - "background-color" : "black", "margin" : "10px"}, + "position" : "fixed", "left" : "30px", "top" : "20px", + 'padding' : '0px', "margin" : "0px", + }, + children=[ + html.Img( + src="https://raw.githubusercontent.com/SystemsGenetics/KINC/master/docs/images/kinc.png", + style={ + "height" : "55px","display" : "inline-block", + "padding" : "0px", "margin" : "0px 10px 0px 10px"}), + html.H1(children="3D Network Explorer", + style={ + "display" : "inline-block", "padding" : "10px 0px 0px 0px", + "margin" : "0px", "vertical-align" : "top"}), + html.Div(children="Network name: " + net_name, + style={"padding" : "0px 0px 0px 10px"}), + ] ), - # 3D Co-Expression scatterplot. - html.Div(children=[ - html.Div(id='coexp-condition-select-box', children=[ - create_condition_select(amx, sample_col)], - style={'padding-bottom' : '10px'}), - dcc.Graph(id = 'edge-expression-3dview', - figure = create_expression_scatterplot(gem, amx, elayers), - config = { - 'toImageButtonOptions' : { - 'filename': 'kinc_3d_expression_scatterplot', - 'width': 800, - 'height': 600, - 'format': 'svg', - 'scale' : 1 - } - }), - dcc.Input( - id='selected-edge', type="number", - value=-1, style= {'display' : 'none'}), - dcc.Input( - id='current-expr-camera-coords', type="number", - value=0, style= {'display' : 'none'})], - style = {"width" : "35%", "display" : "inline-block", - "border" : "1px solid black", "padding" : "10px", - "margin" : "10px", "vertical-align" : "top"}, + # Left Sidebar + html.Div( + style={ + "position" : "fixed", "left" : "30px", "top" : "120px", + 'padding' : '0px 10px 0px 0px', "margin" : "0px", + "width" : "400px", "height" : "80vh", 'overflow-y': 'auto', + "scrollbar-color" : "dark" + }, + children = [ + # Edge Color and Layer selection boxes. + html.Div( + id='edge-select-box', + style=sidebar_box_style, + children=[ + build_sidebar_box_header("Layout and Colors", 'edge-select-box'), + html.Div( + id='edge-select-box-contents', + style={'margin' : '0px', 'display' : 'none', 'padding' : '10px'}, + children = [ + html.Label('Color Edges By'), + create_edge_color_select(net), + html.Label('Layer Edges By'), + create_edge_layer_select(net) + ] + ) + ] + ), + # Node Details + html.Div( + style=sidebar_box_style, + children=[ + build_sidebar_box_header("Node Details", 'node-table-box'), + html.Div( + id="node-table-box-contents", + style={'margin' : '0px', 'visibility' : 'hidden'}, + children=[create_dash_node_table(net, nmeta, vlayers)] + ), + ] + ), + # Edge Table + html.Div( + style=sidebar_box_style, + children=[ + build_sidebar_box_header("Edge Details", 'edge-table-box'), + html.Div( + id="edge-table-box-contents", + style={'margin' : '0px', 'visibility' : 'hidden'}, + children=[create_dash_edge_table(net)] + ), + ] + ), + # 3D Co-Expression scatterplot row + html.Div( + style=sidebar_box_style, + children=[ + build_sidebar_box_header("Edge Scatterplot", 'scatterplot-box'), + html.Div( + id='scatterplot-box-contents', + style={'margin' : '0px', 'display' : 'none'}, + children = [ + html.Div( + style={'padding-bottom' : '10px'}, + children=[ + html.Label('Color Samples By'), + create_condition_select(amx, sample_col) + ], + ), + dcc.Graph( + id = 'edge-expression-3dview', + figure = create_expression_scatterplot(gem, amx, elayers), + config = { + 'toImageButtonOptions' : { + 'filename': 'kinc_3d_expression_scatterplot', + 'width': 800, + 'height': 600, + 'format': 'svg', + 'scale' : 1 + } + }, + ), + ] + ) + ] + ), + # Sample Details + html.Div( + style=sidebar_box_style, + children=[ + build_sidebar_box_header("Sample Details", 'sample-table-box'), + html.Div( + id="sample-table-box-contents", + style={'margin' : '0px', 'visibility' : 'hidden'}, + children=[create_dash_sample_table(net, amx)] + ), + ] + ), + # network stats + html.Div( + style=sidebar_box_style, + children=[ + build_sidebar_box_header("Network Stats", 'network-stats-box'), + html.Div( + id='network-stats-box-contents', + style={'margin' : '0px', 'padding' : '10px'}, + children = [ + create_network_stats_table(net), + dcc.Graph( + id = 'degree-distribution-plot', + figure = create_degree_distribution_plot(vlayers), + config = { + 'toImageButtonOptions' : { + 'filename': 'kinc_3d_degree_distribution', + 'width': 800, + 'height': 800, + 'format': 'svg', + 'scale' : 1 + } + }, + ), + dcc.Graph( + id = 'avg-cc-distribution-plot', + figure = create_avg_cc_distribution_plot(vlayers), + config = { + 'toImageButtonOptions' : { + 'filename': 'kinc_3d_average_cc_distribution', + 'width': 800, + 'height': 600, + 'format': 'svg', + 'scale' : 1 + } + }, + ), + ] + ) + ] + ), + ], ), - ]), - # Table Row - html.Div(className='row', children=[ - html.H3(children="Edge Details", style={ - "display" : "inline-block", "padding" : "10px 0px 0px 0px", - "margin" : "0px", "vertical-align" : "top"}), - html.Div(id = "edge-table", children=[ - create_dash_edge_table(net)]), - html.H3(children="Node Detail", style={ - "display" : "inline-block", "padding" : "10px 0px 0px 0px", - "margin" : "0px", "vertical-align" : "top"}), - html.Div(id = "node-table", children=[ - create_dash_node_table(net, nmeta)]), - html.H3(children="Sample Detail", style={ - "display" : "inline-block", "padding" : "10px 0px 0px 0px", - "margin" : "0px", "vertical-align" : "top"}), - html.Div(id = "sample-table", children=[ - create_dash_sample_table(net, amx)]), - # Holds the values of the clicked node or cliced edge nodes. dcc.Input( - id='selected-nodes', type="text", - value=0, style= {'display' : 'none'}), - ]), - ]) + id='current-expr-camera-coords', + type="number", + value=0, + style= {'display' : 'none'} + ) + ] # End app layout children + ) # End app layout # Callback when an object in the network plot is clicked. @app.callback( - dash.dependencies.Output('selected-edge', 'value'), - [dash.dependencies.Input('network-3dview', 'clickData')]) - def set_current_edge(clickData): + [dash.dependencies.Output('edge-expression-3dview', 'figure'), + dash.dependencies.Output('edge-table', 'children'), + dash.dependencies.Output('node-table', 'children')], + [dash.dependencies.Input('network-3dview', 'clickData'), + dash.dependencies.Input('coexp-condition-select', 'value')], + [dash.dependencies.State('edge-expression-3dview', 'figure')]) + def set_current_edge(clickData, color_col, figure): edge_index = None - edge_nodes = None - nodes = None + node = None if (clickData): + scatterplot = figure + node_table = None + edge_table = None points = clickData['points'] efound = re.match('^Edge: (.*?) \(co\) (.*?)$', points[0]['text']) nfound = re.match('^Node: (.*?)$', points[0]['text']) @@ -1101,36 +1528,29 @@ def set_current_edge(clickData): source = row_vals['Source'] target = row_vals['Target'] edge_nodes = [source, target] - return edge_index #, json.dumps(edge_nodes)) - # elif (nfound): - # return (-1, json.dumps([nfound.group(1)])) - raise dash.exceptions.PreventUpdate + scatterplot = create_expression_scatterplot(gem, amx, elayers, color_col, edge_index) + edge_table = create_dash_edge_table(net, edge_index) + node_table = create_dash_node_table(net, nmeta, vlayers, None) + if (nfound): + node = edge_index = points[0]['customdata'] + node_table = create_dash_node_table(net, nmeta, vlayers, node) + edge_table = create_dash_edge_table(net, None) + return [scatterplot, edge_table, node_table] + + + raise dash.exceptions.PreventUpdate - @app.callback( - dash.dependencies.Output('node-table', 'children'), - [dash.dependencies.Input('selected-nodes', 'value')]) - def update_node_table(nodes): - if nodes == 0: - raise dash.exceptions.PreventUpdate - return create_dash_node_table(net, nmeta, json.loads(nodes)) @app.callback( - dash.dependencies.Output('sample-table', 'children'), + [dash.dependencies.Output('sample-table', 'children')], [dash.dependencies.Input('edge-expression-3dview', 'clickData')]) def update_sample_table(clickData): if (clickData): sample = clickData['points'][0]['text'] - return create_dash_sample_table(net, amx, sample) + return [create_dash_sample_table(net, amx, sample)] raise dash.exceptions.PreventUpdate - @app.callback( - dash.dependencies.Output('edge-table', 'children'), - [dash.dependencies.Input('selected-edge', 'value')]) - def update_edge_table(current_edge): - if (current_edge == -1): - current_edge = None - return create_dash_edge_table(net, current_edge) @app.callback( dash.dependencies.Output('current-network-3dview-camera', 'value'), @@ -1152,23 +1572,14 @@ def set_network_aspect(relayoutData): return aspect raise dash.exceptions.PreventUpdate - # Callback to update the co-expression plot when the edge changes. - @app.callback( - dash.dependencies.Output('edge-expression-3dview', 'figure'), - [dash.dependencies.Input('selected-edge', 'value'), - dash.dependencies.Input('coexp-condition-select', 'value')]) - def update_expression_plot(current_edge, color_col): - if (current_edge == -1): - current_edge = None - return create_expression_scatterplot(gem, amx, elayers, color_col, current_edge) - @app.callback( dash.dependencies.Output('network-3dview', 'figure'), [dash.dependencies.Input('edge-color-select', 'value'), dash.dependencies.Input('edge-layer-select', 'value')], [dash.dependencies.State('current-network-3dview-camera', 'value'), - dash.dependencies.State('current-network-3dview-aspect', 'value')]) + dash.dependencies.State('current-network-3dview-aspect', 'value')] + ) def update_network_plot(color_by, layer_by, camera_vals, aspect_vals): camera = None aspect = None @@ -1183,10 +1594,93 @@ def update_network_plot(color_by, layer_by, camera_vals, aspect_vals): return create_network_plot(net, vlayers, elayers, color_by, layer_by, camera, aspect) + @app.callback( + dash.dependencies.Output('edge-select-box-contents', 'style'), + [dash.dependencies.Input('edge-select-box-toggle', 'n_clicks')] + ) + def toggle_edge_select_box(toggle): + if (toggle % 2 == 1): + return {'margin' : '0px', 'visibility' : 'visible', 'padding' : '10px'} + else: + return {'margin' : '0px', 'visibility' : 'hidden', 'height' : '0px', 'padding' : '0px'} + + + @app.callback( + dash.dependencies.Output('scatterplot-box-contents', 'style'), + [dash.dependencies.Input('scatterplot-box-toggle', 'n_clicks')] + ) + def toggle_scatterplot_box(toggle): + if (toggle % 2 == 1): + return {'margin' : '0px', 'visibility' : 'visible', 'max-height' : '500px', 'padding' : '10px'} + else: + return {'margin' : '0px', 'visibility' : 'hidden', 'height' : '0px', 'padding' : '0px'} + + + @app.callback( + dash.dependencies.Output('sample-table-box-contents', 'style'), + [dash.dependencies.Input('sample-table-box-toggle', 'n_clicks')] + ) + def toggle_sample_table_box(toggle): + if (toggle % 2 == 1): + return { + 'margin' : '0px', 'visibility' : 'visible', + 'max-height' : '250px', 'padding' : '10px', + 'overflow-y': 'auto', + } + else: + return {'margin' : '0px', 'visibility' : 'hidden', 'height' : '0px', 'padding' : '0px'} + + @app.callback( + dash.dependencies.Output('network-stats-box-contents', 'style'), + [dash.dependencies.Input('network-stats-box-toggle', 'n_clicks')] + ) + def toggle_network_stats_box(toggle): + if (toggle % 2 == 0): + return {'margin' : '0px', 'visibility' : 'visible', 'padding' : '10px'} + else: + return {'margin' : '0px', 'visibility' : 'hidden', 'height' : '0px', 'padding' : '0px'} + + @app.callback( + dash.dependencies.Output('node-table-box-contents', 'style'), + [dash.dependencies.Input('node-table-box-toggle', 'n_clicks')] + ) + def toggle_node_table_box(toggle): + if (toggle % 2 == 1): + return { + 'margin' : '0px', 'visibility' : 'visible', + 'max-height' : '250px', 'padding' : '10px', + 'overflow-y': 'auto', + } + else: + return {'margin' : '0px', 'visibility' : 'hidden', 'height' : '0px', 'padding' : '0px'} + + + @app.callback( + dash.dependencies.Output('edge-table-box-contents', 'style'), + [dash.dependencies.Input('edge-table-box-toggle', 'n_clicks')] + ) + def toggle_edge_table_box(toggle): + if (toggle % 2 == 1): + return { + 'margin' : '0px', 'visibility' : 'visible', + 'max-height' : '250px', 'padding' : '10px', + 'overflow-y': 'auto', + } + else: + return {'margin' : '0px', 'visibility' : 'hidden', 'height' : '0px', 'padding' : '0px'} return app +def is_port_in_use(port): + """ + Checks if a port is already in use. + + port: the desired port to use + """ + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + return s.connect_ex(('localhost', port)) == 0 + def main(): """ @@ -1195,14 +1689,14 @@ def main(): """ parser = argparse.ArgumentParser() - parser.add_argument('--net', dest='net_path', type=str, required=True) - parser.add_argument('--emx', dest='gem_path', type=str, required=True) - parser.add_argument('--amx', dest='amx_path', type=str, required=True) - parser.add_argument('--sample_col', dest='sample_col', type=str, required=False, default='Sample') - parser.add_argument('--nmeta', dest='nmeta', type=str, required=False) - parser.add_argument('--debug', dest='debug', action='store_true', default=False) - parser.add_argument('--redo-layout', dest='redo_layout', action='store_true', default=False) - parser.add_argument('--iterations', dest='iterations', type=int, default=100) + parser.add_argument('--net', dest='net_path', type=str, required=True, help="(required) The path to the KINC-derived network file") + parser.add_argument('--emx', dest='gem_path', type=str, required=True, help="(retuired) The path to the log2 transformed Gene Expression Matrix or Metabolite abundance matrix.") + parser.add_argument('--amx', dest='amx_path', type=str, required=True, help="(required) The path to the tab-delimited annotation matrix. The matrix must have at least one column that contains a unique list of sample names.") + parser.add_argument('--sample_col', dest='sample_col', type=str, required=False, default='Sample', help="(optional) The name of the column in the annotation matrix that contains the unique sample names. Defaults to 'Sample'") + parser.add_argument('--nmeta', dest='nmeta', type=str, required=False, help="(optional) The path to a tab-delimited node meta data file. The format of the file must have 4 columns, with the first containing the node name, the second a controlled vocabulary term ID, the third the term definition and the fourth the vocubulary name.") + parser.add_argument('--debug', dest='debug', action='store_true', default=False, help="(optional). Add this argument to enable Dash application debugging mode.") + parser.add_argument('--redo-layout', dest='redo_layout', action='store_true', default=False, help=" (optional). If the 2D and 3D network layout has already been constructed it will be loaded from a file. Add this arugment to force the layouts to be rebuilt and not loaded from the files. To prevent Dash from rerunning the layout on callbacks, this option results in the program terminating. To view the application, restart without this option.") + parser.add_argument('--iterations', dest='iterations', type=int, default=100, help="(optional). The number of iterations to perform when calculating the Force Atlas2 layout. This argument is only used the first time a network is viewed or if the --redo_layout argument is provided.") args = parser.parse_args() # Make sure the paths exist @@ -1249,13 +1743,18 @@ def main(): # If the user requested we rebuild the layout then terminate so Dash # doesn't try to rebuild the layout on each callback. if args.redo_layout: - print ("Layouts have been built. Please relaunch without the --redo-layouts option to view the app.") + print ("Layouts have been built. Please relaunch without the --redo-layout option to view the app.") exit(0) # Launch the dash application print("Launching application...") app = build_application(net, gem, amx, nmeta, vlayers, elayers, args.sample_col, net_prefix) - app.run_server(debug=args.debug) + + + port = 8050 + while(is_port_in_use(port)): + port = port + 1 + app.run_server(debug=args.debug, port=port) exit(0) diff --git a/bin/kinc-filter-bias.R b/bin/kinc-filter-bias.R index 72dee4e..67aca73 100755 --- a/bin/kinc-filter-bias.R +++ b/bin/kinc-filter-bias.R @@ -12,10 +12,10 @@ option_list = list( help="(optional). The number of computational threads to use for parallel processing. By default, all but 2 cores available on the local machine will be used.", metavar="numeric"), make_option(c("--wa_th"), type="numeric", default=1e-3, - help="(optional). The signficance threshold for performing the Welch Anova test. This test checks for differential expression of the GMM cluster vs non-cluster for each gene, thus, two tests are performed: one for each gene in the edge. Edges with both tests having P-values below this value are kept. The default is 1e-3.", + help="(optional). The p-value threshold for performing the Welch Anova test. This test checks for differential expression of the GMM cluster vs non-cluster for each gene, thus, two tests are performed: one for each gene in the edge. Edges with both tests having P-values below this value are kept. The default is 1e-3.", metavar="numeric"), make_option(c("--mtt_th"), type="numeric", default=0.1, - help="(optional). The signficance threshold for performing the paired T-test for missingness. This test checks for signicant difference in missingness between the two genes of an edge. This is important because one gene with a high level of missginess will bias the relationship if that missigness is condition-specific. Only edges whose genes have the same pattern of missingness should be considered. Those with p-values greater than the threshold are considered non-different. The default is 0.1.", metavar="numeric"), + help="(optional). The p-value threshold for performing the paired T-test for missingness. This test checks for signicant difference in missingness between the two genes of an edge. This is important because one gene with a high level of missginess will bias the relationship if that missigness is condition-specific. Only edges whose genes have the same pattern of missingness should be considered. Those with p-values greater than the threshold are considered non-different. The default is 0.1.", metavar="numeric"), make_option(c("--out_prefix"), type="character", default=NULL, help="(optional). The file name prefix used for the ouput files. If this arugment is not provided then the original network file name is used as a prefix.", metavar="character"), make_option(c("--suffix"), type="character", default=NULL, @@ -45,6 +45,10 @@ if (length(opt$suffix) > 0) { suffix = opt$suffix } +# Make sure KINC.R is at the correct vresion. +if (packageVersion("KINC.R") < "1.1") { + stop("This script requires KINC.R > 1.1") +} suppressMessages(library("KINC.R")) message("") diff --git a/bin/kinc-filter-phased.R b/bin/kinc-filter-phased.R new file mode 100755 index 0000000..cc8a66a --- /dev/null +++ b/bin/kinc-filter-phased.R @@ -0,0 +1,127 @@ +#!/usr/bin/env Rscript + +library("optparse") + +option_list = list( + make_option(c("--net"), type="character", default=NULL, + help="The path to the KINC-derived network file. It must be in tidy format.", + metavar="character"), + make_option(c("--emx"), type="character", default=NULL, + help="The path to the log2 transformed Gene Expression Matrix or Metabolite abundance matrix. This must the be same file used by KINC to create the network.", metavar="character"), + make_option(c("--amx"), type="character", default=NULL, + help="The path to the sample annotation matrix. This must the be same file used by KINC to perform the cond-test function.", metavar="character"), + make_option(c("--test_col"), type="character", default=NULL, + help="The name of the column in the sample annotation matrix that has the labels that should be tested for phased (or differential expression).", metavar="character"), + make_option(c("--sample_col"), type="character", default=NULL, + help="(optional). The name of the column in the sample annotation matrix that contains the sample names. The default is 'Sample'.", metavar="character"), + make_option(c("--min_cluster_size"), type="numeric", default=10, + help="(optional). The minimum number of samples, per label, that must exist for the label to be tested. The labels are the categories in the 'test_col' of the annotation matrix. ", metavar="numeric"), + make_option(c("--out_prefix"), type="character", default=NULL, + help="(optional). The file name prefix used for the ouput files. If this arugment is not provided then the original network file name is used as a prefix.", metavar="character"), + make_option(c("--threads"), type="numeric", default=0, + help="(optional). The number of computational threads to use for parallel processing. By default, all but 2 cores available on the local machine will be used.", + metavar="numeric"), + make_option(c("--th"), type="numeric", default=1e-3, + help="(optional). The p-value threshold for performing the Hotelling test. This test checks for differential expression within the GMM cluster vs for the categorical column specifed by 'test_col'. Edges having P-values below this value are kept. The default is 1e-3.", + metavar="numeric") +); + +opt_parser = OptionParser(option_list=option_list); +opt = parse_args(opt_parser); + +if (is.null(opt$net)){ + print_help(opt_parser) + stop("Please provide a network file (--net).", call.=FALSE) +} +if (is.null(opt$emx)){ + print_help(opt_parser) + stop("Please provide the expression matrix file (--emx).", call.=FALSE) +} +if (is.null(opt$amx)){ + print_help(opt_parser) + stop("Please provide the sample annotation matrix file (--amx).", call.=FALSE) +} +if (is.null(opt$test_col)){ + print_help(opt_parser) + stop("Please provide the column in the annotation matrix file that should be tested for differential expression (--test_col).", call.=FALSE) +} +if (is.null(opt$sample_col)){ + opt$sample_col = 'Sample' +} +if (is.null(opt$th) || is.na(opt$th)){ + opt$th = 1e-3 +} +if (is.null(opt$min_cluster_size) || is.na(opt$min_cluster_size)){ + opt$th = 10 +} + +net_prefix = basename(opt$net) +if (length(opt$out_prefix) > 0) { + net_prefix = opt$out_prefix +} + +# Make sure KINC.R is at the correct vresion. +if(packageVersion("KINC.R") < "1.2") { + stop("This script requires KINC.R > 1.2") +} +suppressMessages(library("KINC.R")) + +message("") +message(' __ __ ______ __ __ ____ ____ ') +message('/\\ \\/\\ \\ /\\__ _\\ /\\ \\/\\ \\/\\ _`\\ /\\ _`\\ ') +message('\\ \\ \\/\'/\'\\/_/\\ \\/ \\ \\ `\\\\ \\ \\ \\/\\_\\ \\ \\ \\L\\ \\ ') +message(' \\ \\ , < \\ \\ \\ \\ \\ , ` \\ \\ \\/_/_ \\ \\ , / ') +message(' \\ \\ \\\\`\\ \\_\\ \\__\\ \\ \\`\\ \\ \\ \\L\\ \\__\\ \\ \\\\ \\ ') +message(' \\ \\_\\ \\_\\ /\\_____\\\\ \\_\\ \\_\\ \\____/\\_\\\\ \\_\\ \\_\\ ') +message(' \\/_/\\/_/ \\/_____/ \\/_/\\/_/\\/___/\\/_/ \\/_/\\/ / ') +message("") +message("This script uses KINC.R, a companion R library for KINC") +message("https://github.com/SystemsGenetics/KINC.R") +message("-------------------------------------------------------") + + +# Load the expression matrix, annotation matrix and the network. +message("Loading the KINC network file...") +net = loadKINCNetwork(opt$net) + +message("Loading the expression matrix file...") +emx = loadGEM(opt$emx) + +message("Loading the sample annotation matrix file...") +amx = loadSampleAnnotations(opt$amx, opt$sample_col) + + +# Find out how many threads we should use. +threads = opt$threads +if (threads == 0) { + threads = detectCores() - 2 + if (threads < 1) { + threads = 1 + } +} + +# Filter biased edges +message("Filtering the network for phased edges...") +message(paste0(" Num threads: max allowed - 2")) +message(paste0(" Hotellings test threshold: ", opt$th)) +message(paste0(" Sample Column: ", opt$sample_col)) +message(paste0(" Test Column: ", opt$test_col)) +message(paste0(" Minimum Cluster Size: ", opt$min_cluster_size)) +message("") + +# Set this option so that the progress bar appears. +pbo = pboptions(type="txt") + +message("Filtering phased edges...") +net2 = performEdgeDiff(net, emx, amx, test_column = opt$test_col, + sample_col = opt$sample_col, min_cluster_size = opt$min_cluster_size, threads=threads) + +message("Saving phased network...") +phased_net = net2[which(net2['hotelling_p_value'] < opt$th),] +net_name = paste0(net_prefix, '.phased.csGCN.txt') +saveKINCNetwork(phased_net, net_name) + +message("Saving non-phased network...") +unphased_net = net2[which(net2['hotelling_p_value'] >= opt$th),] +net_name = paste0(net_prefix, '.non-phased.csGCN.txt') +saveKINCNetwork(unphased_net, net_name) diff --git a/bin/kinc-filter-rank.R b/bin/kinc-filter-rank.R index 9256aa9..d908730 100755 --- a/bin/kinc-filter-rank.R +++ b/bin/kinc-filter-rank.R @@ -10,7 +10,7 @@ option_list = list( help="(optional). The file name prefix used for the ouput files. If this arugment is not provided then the original network file name is used as a prefix.", metavar="character"), make_option(c("--top_n"), type="numeric", default=10000, - help="(optional). The number of edges to extract from the network file. This is the top n ranked edges per condition. If --no_conditions is provided then this is the top n ranked edges in the full network.", + help="(optional). Extract the top n ranked edges from the network file. This is the top n ranked edges per condition. If --no_conditions is provided then this is the top n ranked edges in the full network. Note: some edges may have the same rank, but only the first top_n will be returned.", metavar="numeric"), make_option(c("--no_condition_rankings"), type="logical", action="store_true", help="(optional). By deafult the rankings are calculated independently per each condition in the network. To calculate rankings for the entire network regardless of condition set this to TRUE.", @@ -52,8 +52,12 @@ if (is.null(opt$save_condition_networks)){ opt$save_condition_networks = FALSE } - +# Make sure KINC.R is at the correct vresion. +if(packageVersion("KINC.R") < "1.1") { + stop("This script requires KINC.R > 1.1") +} suppressMessages(library("KINC.R")) +suppressMessages(library("dplyr")) message("") message(' __ __ ______ __ __ ____ ____ ') @@ -67,31 +71,49 @@ message("") message("This script uses KINC.R, a companion R library for KINC") message("https://github.com/SystemsGenetics/KINC.R") message("-------------------------------------------------------") -message("Loading the KINC network file...") -net = loadKINCNetwork(opt$net) # Get the edge ranks. -ranks = c() -if (opt$no_condition_rankings) { - message("Calculating network-wide ranks...") - ranks = getEdgeRanks(net, by_condition=FALSE, opt$score_weight, opt$pval_weight, opt$rsqr_weight) +rdata = paste0(opt$out_prefix, '-th_ranked.RData') +if (file.exists(rdata)) { + message("Reloading previously calculated ranks...") + load(rdata) } else { - message("Calculating condition-specific ranks...") - ranks = getEdgeRanks(net, by_condition=TRUE, opt$score_weight, opt$pval_weight, opt$rsqr_weight) + message("Loading the KINC network file...") + + net = loadKINCNetwork(opt$net) + ranks = c() + if (opt$no_condition_rankings) { + message("Calculating network-wide ranks...") + ranks = getEdgeRanks(net, by_condition=FALSE, opt$score_weight, opt$pval_weight, opt$rsqr_weight) + } else { + message("Calculating condition-specific ranks...") + ranks = getEdgeRanks(net, by_condition=TRUE, opt$score_weight, opt$pval_weight, opt$rsqr_weight) + } + ranks = as.numeric(ranks) + net$rank = ranks + save(net, file=rdata) } -net$rank = ranks + # Filter the network to include the top n ranked edges. -netF = net[(which(ranks < opt$top_n)),] -netF = netF[order(netF$rank),] +if (opt$no_condition_rankings) { + message(paste("Filtering by top,", opt$top_n, ", network edges...")) + net = net[order(net$rank),] + netF = net[1:opt$top_n,] +} else { + message(paste("Filtering by top,", opt$top_n, ", conditional edges...")) + net = net[order(net$Test_Name, net$rank),] + netF = net %>% group_by(Test_Name) %>% slice_head(n=opt$top_n) %>% as.data.frame() +} + if (opt$save_condition_networks) { - message("Saving filtered condition-specific networks...") + message("Writing filtered condition-specific network files...") net_name = paste0(opt$out_prefix, '-th_ranked.%condition%-%filter%.csGCN.txt') saveConditionKINCNetwork(netF, net_name, filter=opt$unique_filter) } else { - message("Saving filtered network...") + message("Writing filtered network file...") net_name = paste0(opt$out_prefix, '-th_ranked.csGCN.txt') saveKINCNetwork(netF, net_name) } diff --git a/bin/kinc-make-plots.R b/bin/kinc-make-plots.R index ffceaca..e37a0df 100755 --- a/bin/kinc-make-plots.R +++ b/bin/kinc-make-plots.R @@ -6,7 +6,7 @@ option_list = list( make_option(c("--net"), type="character", default=NULL, help="The path to the KINC-derived network file. It must be in tidy format.", metavar="character"), - make_option(c("--out_prefix"), type="character", default=NULL, + make_option(c("--out_prefix"), type="character", default=NULL, help="(optional). The file name prefix used for the ouput files. If this arugment is not provided then the original network file name is used as a prefix.", metavar="character") ); @@ -18,6 +18,10 @@ if (is.null(opt$net)){ stop("Please provide a network file (--net).", call.=FALSE) } +# Make sure KINC.R is at the correct vresion. +if(packageVersion("KINC.R") < "1.1") { + stop("This script requires KINC.R > 1.1") +} suppressMessages(library("KINC.R")) message("") diff --git a/docker/Dockerfile b/docker/Dockerfile index b377b7a..8d83ed6 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -3,7 +3,8 @@ MAINTAINER Ben Shealy ARG NVIDIA_HEADLESS=0 ARG ACE_REVISION="develop" -ARG KINC_REVISION="master" +ARG KINC_REVISION="develop" +ARG KINC_R_REVISION="master" ENV CUDADIR="/usr/local/cuda" ENV CPLUS_INCLUDE_PATH="${CUDADIR}/include:${CPLUS_INCLUDE_PATH}" @@ -14,19 +15,24 @@ ENV LD_LIBRARY_PATH="${CUDADIR}/lib64:${LD_LIBRARY_PATH}" RUN apt-get update -qq \ && apt-get install -qq -y \ qt5-default \ - clinfo git libgsl-dev liblapacke-dev libopenblas-dev libopenmpi-dev ocl-icd-opencl-dev \ + clinfo \ + git \ + libboost-dev \ + libgsl-dev \ + liblapacke-dev \ + libopenblas-dev \ + libopenmpi-dev \ + ocl-icd-opencl-dev \ python3-pip -# install headless driver for cpu image + +# Install headless driver for cpu image RUN if [ ${NVIDIA_HEADLESS} = 1 ]; then apt-get install -qq -y nvidia-headless-418 ; fi # add NVIDIA platform to OpenCL RUN mkdir -p /etc/OpenCL/vendors \ && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd -# install python dependencies -RUN pip3 install -q argparse matplotlib numpy pandas scipy seaborn - # install StatsLib WORKDIR /opt @@ -39,11 +45,10 @@ RUN git clone -q https://github.com/kthohr/stats \ # install ACE WORKDIR /opt -ADD https://api.github.com/repos/SystemsGenetics/ACE/git/refs/heads/${ACE_REVISION} ace-version.json RUN git clone -q https://github.com/SystemsGenetics/ACE.git \ && cd ACE/build \ && git checkout -q ${ACE_REVISION} \ - && qmake ../src/ACE.pro GUI=no \ + && qmake ../src/ACE.pro \ && make -s -j $(nproc) \ && make -s qmake_all \ && make -s install @@ -53,14 +58,69 @@ ENV LD_LIBRARY_PATH "/usr/local/lib:$LD_LIBRARY_PATH" # install KINC WORKDIR /opt -ADD https://api.github.com/repos/SystemsGenetics/KINC/git/refs/heads/${KINC_REVISION} kinc-version.json -RUN git clone -q https://github.com/SystemsGenetics/KINC.git \ - && cd KINC/build \ +# For older version of KINC we need to use qmake. For newer versions after 3.3.x we +# can use the make file in the root directory of KINC. +RUN if echo "${KINC_REVISION}" | grep -Eq 'v3\.[32]\.[0-9]$' ; then \ + git clone -q https://github.com/SystemsGenetics/KINC.git \ + && cd KINC \ && git checkout -q ${KINC_REVISION} \ - && qmake ../src/KINC.pro GUI=no \ + && cd build \ + && qmake ../src/KINC.pro \ && make -s -j $(nproc) \ && make -s qmake_all \ - && make -s install + && make -s install; \ +else \ + git clone -q https://github.com/SystemsGenetics/KINC.git \ + && cd KINC \ + && git checkout -q ${KINC_REVISION} \ + && make -s -j $(nproc) \ + && make -s install; \ +fi + + +# Add in a few additional requirements for the >=3.4 version of KINC. These are needed +# for the R and Python scripts in the bin folder. +RUN if ! echo "${KINC_REVISION}" | grep -Eq 'v3\.[32]\.[0-9]$' ; then \ + apt-get install -qq -y \ + m4 \ + autoconf \ + automake \ + libxml2-dev \ + bison \ + flex \ + build-essential \ + libcurl4-gnutls-dev \ + libssl-dev \ + software-properties-common; \ +fi + +# Install KINC.R +ENV DEBIAN_FRONTEND=noninteractive +RUN if ! echo "${KINC_REVISION}" | grep -Eq 'v3\.[32]\.[0-9]$' ; then \ + apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 \ + && add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/' \ + && apt-get update -qq \ + && apt-get -qq install -y r-base \ + && R -q -e "install.packages('devtools', dependencies=TRUE, repos='http://cran.us.r-project.org')" \ + && R -e "library('devtools'); install_github('SystemsGenetics/KINC.R', ref='${KINC_R_REVISION}')"; \ +fi + +# install python dependencies +WORKDIR /opt/KINC + +RUN if [ -e requirements.txt ]; then pip3 install -r requirements.txt; fi + + # initialize default work directory WORKDIR /workspace + +# Tini for signal processing and zombie killing. +ENV TINI_VERSION v0.19.0 +ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini +RUN chmod +x /tini + + +# Define the command and parameters that will be executed when this +# container is first run. +ENTRYPOINT ["/tini", "--"] diff --git a/docker/Makefile b/docker/Makefile index 04138c8..e7f5844 100644 --- a/docker/Makefile +++ b/docker/Makefile @@ -1,52 +1,121 @@ -all: +all: develop latest v3.4.x v3.3.x v3.2.x + +develop: develop-cpu develop-gpu + +develop-cpu: + docker build \ + -t systemsgenetics/kinc:develop-cpu \ + --build-arg NVIDIA_HEADLESS=1 \ + --build-arg ACE_REVISION=v3.2.0 \ + --build-arg KINC_REVISION=develop \ + --build-arg KINC_R_REVISION=v1.2 \ + . + +develop-gpu: + docker build \ + -t systemsgenetics/kinc:develop-gpu \ + --build-arg NVIDIA_HEADLESS=0 \ + --build-arg ACE_REVISION=v3.2.0 \ + --build-arg KINC_REVISION=develop \ + --build-arg KINC_R_REVISION=v1.2 \ + . + + +latest: latest-cpu latest-gpu + +latest-cpu: docker build \ -t systemsgenetics/kinc:latest-cpu \ --build-arg NVIDIA_HEADLESS=1 \ - --build-arg ACE_REVISION=develop \ - --build-arg KINC_REVISION=master \ + --build-arg ACE_REVISION=v3.2.0 \ + --build-arg KINC_REVISION=v3.4.2 \ + --build-arg KINC_R_REVISION=v1.2 \ . + +latest-gpu: docker build \ -t systemsgenetics/kinc:latest-gpu \ --build-arg NVIDIA_HEADLESS=0 \ - --build-arg ACE_REVISION=develop \ - --build-arg KINC_REVISION=master \ + --build-arg ACE_REVISION=v3.2.0 \ + --build-arg KINC_REVISION=v3.4.2 \ + --build-arg KINC_R_REVISION=v1.2 \ . + +v3.4.x: v3.4.x-cpu v3.4.x-gpu + +v3.4.x-cpu: + docker build \ + -t systemsgenetics/kinc:3.4.2-cpu \ + --build-arg NVIDIA_HEADLESS=1 \ + --build-arg ACE_REVISION=v3.2.0 \ + --build-arg KINC_REVISION=v3.4.2 \ + --build-arg KINC_R_REVISION=v1.2 \ + . + +v3.4.x-gpu: + docker build \ + -t systemsgenetics/kinc:3.4.2-gpu \ + --build-arg NVIDIA_HEADLESS=0 \ + --build-arg ACE_REVISION=v3.2.0 \ + --build-arg KINC_REVISION=v3.4.2 \ + --build-arg KINC_R_REVISION=v1.2 \ + . + +v3.3.x: v3.3.x-cpu v3.3.x-gpu + +v3.3.x-cpu: docker build \ -t systemsgenetics/kinc:3.3.0-cpu \ --build-arg NVIDIA_HEADLESS=1 \ --build-arg ACE_REVISION=v3.1.0 \ --build-arg KINC_REVISION=v3.3.0 \ . + +v3.3.x-gpu: docker build \ -t systemsgenetics/kinc:3.3.0-gpu \ --build-arg NVIDIA_HEADLESS=0 \ --build-arg ACE_REVISION=v3.1.0 \ --build-arg KINC_REVISION=v3.3.0 \ . + +v3.2.x: docker build \ -t systemsgenetics/kinc:3.2.2 \ - --build-arg NVIDIA_HEADLESS=0 \ + --build-arg NVIDIA_HEADLESS=1 \ --build-arg ACE_REVISION=v3.0.2 \ --build-arg KINC_REVISION=v3.2.2 \ . push: + docker push systemsgenetics/kinc:develop-cpu + docker push systemsgenetics/kinc:develop-gpu docker push systemsgenetics/kinc:latest-cpu docker push systemsgenetics/kinc:latest-gpu + docker push systemsgenetics/kinc:3.4.2-cpu + docker push systemsgenetics/kinc:3.4.2-gpu docker push systemsgenetics/kinc:3.3.0-cpu docker push systemsgenetics/kinc:3.3.0-gpu docker push systemsgenetics/kinc:3.2.2 pull: + docker pull systemsgenetics/kinc:develop-cpu + docker pull systemsgenetics/kinc:develop-gpu docker pull systemsgenetics/kinc:latest-cpu docker pull systemsgenetics/kinc:latest-gpu + docker pull systemsgenetics/kinc:3.4.2-cpu + docker pull systemsgenetics/kinc:3.4.2-gpu docker pull systemsgenetics/kinc:3.3.0-cpu docker pull systemsgenetics/kinc:3.3.0-gpu docker pull systemsgenetics/kinc:3.2.2 clean: + docker image rm -f systemsgenetics/kinc:develop-cpu + docker image rm -f systemsgenetics/kinc:develop-gpu docker image rm -f systemsgenetics/kinc:latest-cpu docker image rm -f systemsgenetics/kinc:latest-gpu + docker image rm -f systemsgenetics/kinc:3.4.2-cpu + docker image rm -f systemsgenetics/kinc:3.4.2-gpu docker image rm -f systemsgenetics/kinc:3.3.0-cpu docker image rm -f systemsgenetics/kinc:3.3.0-gpu docker image rm -f systemsgenetics/kinc:3.2.2 diff --git a/docs/conf.py b/docs/conf.py index 219bb57..8d54f37 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -35,7 +35,7 @@ # The short X.Y version version = u'' # The full version, including alpha/beta/rc tags -release = u'v3.4.1' +release = u'v3.4.2' # -- General configuration --------------------------------------------------- diff --git a/docs/create_a_network.rst b/docs/create_a_network.rst index 6908f9e..7c625f3 100644 --- a/docs/create_a_network.rst +++ b/docs/create_a_network.rst @@ -316,10 +316,11 @@ When using the GMM approach, the goal is to identiy condition-specific subgraphs As in previous steps, the ``--emx``, ``--cmx``, ``--ccm`` and ``--csm`` arguments provide the expression matrix, correlation, clustering matrix and the new condition-specific matrix. A threshold is provided to the ``--mincorr`` argument typically as a lower-bound. No edges with absolute correlation values below this value will be extracted. Additinally, if you would like to exclude high correlations (such as perfect correlations), you can do so with the ``--maxcorr`` argument. You should only need to change the ``--maxcorr`` argument if it was determined that there is error in the data resulting from an inordinate number of high correlations. In the example above the ``--mincorr`` is set at 0.5. This is quite low by traditional standards but the following filtering and thresholding steps support exploration of edges at such a low correlation. -To limit the size of the condition-specific subgraphs you should then set the ``--filter-pvalue`` and ``--filter-rsquare`` values to lower-bounds for signficant p-values and meaningful r-square values from test. The r-square values are only present for quantitative features where the regression test was performed. The p-value in this case indicates how well the data follows a trend and the r-square indicates how much of the variation the trend line accounts for. Ideally, low p-values and high r-squre are desired. However, there are no rules for the best setting, but choose settings that provide a signficance level you are comfortable with. +To limit the size of the condition-specific subgraphs you should then set the ``--filter-pvalue`` and ``--filter-rsquare`` values to lower-bounds for signficant p-values and meaningful R-square values from test. The R-square values are only present for quantitative features where the regression test was performed. The p-value in this case indicates how well the data follows a trend and the r-square indicates how much of the variation the trend line accounts for. Ideally, low p-values and high r-squre are desired. However, there are no rules for the best setting, but choose settings that provide a signficance level you are comfortable with. Finally, the ``--format`` argument can be ``tidy``, ``text``, ``minimal`` or ``graphml``. The ``tidy`` format is recommended for use by later steps. The the `GraphML `_ version is larger in size and in an XML format compatible with other graph tools. The ``tidy``, ``test`` and ``graphml`` formats are easily imported into Cytoscape. The ``minimal`` format contains the list of edges with only the two genes and the correlation value. See the :ref:`plain-text-reference-label` section for specific details about these files. + Complex Filtering ::::::::::::::::: @@ -443,17 +444,7 @@ The result from the command-above is a set of files, one per condition class whe Step 9: Visualization ````````````````````` -You can visualize the network using 2D layout tools such as `Cytocape`_, which is a feature rich 2D viauliation software packge. However, KINC includes a Python v3 Dash-based application that can be used for 3D visualization of the network. You must have the following Python v3 libraries installed prior to using this viewer: - - - argparse - - numpy - - pandas - - igraph - - plotly - - seaborn - - fa2 (ForceAtlas2) - - dash - - progress.bar +You can visualize the network using 2D layout tools such as `Cytocape`_, which is a feature rich 2D viauliation software packge. However, KINC includes a Python v3 Dash-based application that can be used for 3D visualization of the network. Please see the :ref:`_installation_reference_label` for the list of Python libraries that are required. The following is an example for launching the viewer for the network containing all condition-specific subgraphs: diff --git a/docs/installation.rst b/docs/installation.rst index 5d70aac..ae6ff04 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -7,6 +7,8 @@ Even if you install KINC on an HPC system, you may want to install KINC on a loc Additionally, KINC can also be run in a Docker container. Consult the :doc:`usage` section for more information. +.. _installation_reference_label: + Dependencies ------------ @@ -22,15 +24,12 @@ KINC requires the following software packages. - `LAPACK `_ - `GCEM `_ - `StatsLib `_ -- `KINC.R v1.0 `_ (for post filtering of networks) -- Python v3 and A variety of Python modules for the 3D visualization. - - -Some functionality of KINC (i.e. condition-specific network construction) require two additional set of dependences: `KINC.R v1.0 `_ , Python3 and a variety of Python modules for the 3D visualization. +- `Boost C++ libraries `_ +Some functionality of KINC (i.e. condition-specific network construction) require two additional set of dependences: `KINC.R v1.1 `_ , Python3 and a variety of Python modules for the 3D visualization. To install KINC.R please follow the installation instructions on the `KINC.R repository `_. KINC.R requires a variety of other R modules. -If you desire to use the Python v3 `Plotly Dash `_ 3D visualization script you must also install the following packages: +If you desire to use the Python v3 `Plotly Dash `_ 3D visualization script you must also install the following packages: - `Numpy `_ - `Pandas `_ @@ -44,7 +43,7 @@ If you desire to use the Python v3 `Plotly Dash `_ 3D Install these Python v3 packages using your favorite package manager (i.e. pip, Anaconda, etc.) Installing KINC on Ubuntu 18.04 --------------------------- +------------------------------- Install Dependencies ~~~~~~~~~~~~~~~~~~~~ @@ -60,7 +59,8 @@ Most dependencies are available as packages via Ubuntu and can be installed usin libopenmpi-dev \ ocl-icd-opencl-dev \ liblapacke-dev \ - nvidia-cuda-toolkit + nvidia-cuda-toolkit \ + libboost-dev Additionally, since KINC uses the NVIDIA Driver API, you must install either the appropriate NVIDIA drivers for your system or the NVIDIA headless driver if you don't have a GPU: @@ -154,7 +154,7 @@ Select a suitable `version of KINC `_ w The KINC docker image comes pre-installed with all dependencies. The Dockerfile for KINC is available in the KINC Github repository, and Docker images are maintained on DockerHub under ``systemsgenetics/kinc``. This method currently does not support the GUI version of KINC. -To use KINC in an interactive Docker container execute the following: +Interactive Mode +```````````````` +To use KINC in an interactive Docker container **with GPUs** execute the following: + + +.. code:: bash + + docker run --gpus all --rm -it --net=host -v ${PWD}:/workspace -u $(id -u ${USER}) systemsgenetics/kinc:3.4.2-gpu /bin/bash + +The command above will provide access to the terminal inside of the `systemsgenetics/kinc:3.4.2-gpu` image. + +To use KINC in an interactive Docker container **without GPUs** execute the following: .. code:: bash - nvidia-docker run --rm -it systemsgenetics/kinc:3.4.1 bash + docker run --gpus all --rm -it --net=host -v ${PWD}:/workspace -u $(id -u ${USER}) systemsgenetics/kinc:3.4.2-cpu /bin/bash + +The above command uses the image `systemsgenetics/kinc:3.4.2-cpu` (note the "-cpu" suffix). + +The following describes the meaning of each argument in the commands above: -The command above will provide access to the terminal inside of the image where commands such as the following can be executed: +- `--gpus all`: ensures that the NVidia CUDA libraries are present in the image. This is needed even if using only CPUs. +- `--rm`: will cause the container to be cleaned up after you exit. +- `--it`: tells Docker to run in interactive mode with a terminal +- `--net=host`: exposes network ports in the image to your local machine. This is needed to use the Docker image for the 3D KINC viewer. +- `-v ${PWD}:/workspace`: adds the current directory on the local machine as a filesystem in the image accessible via `/workspace`. This allows you to work with files on your local machine inside of the image. +- `-u $(id -u ${USER})`: logs you in as your local user account so that any files created within directories mounted by the image are saved with your user account. + +Once inside the KINC Docker image you can execute commands such as the following: .. code:: bash > nvidia-smi > kinc settings -You will need to share the input and output data between the Docker container and the host machine, which can be done by mounting a directory with the ``-v`` argument. The example below mounts the current directory specified by the `$PWD` environment variable onto the `/root` directory of the image: +Test The Example Data +````````````````````` + +To test the docker image using the example data that comes with KINC. Run the following inside of the KINC source directory on your local machine: .. code:: bash - nvidia-docker run --rm -it -v $PWD:/root systemsgenetics/kinc:3.4.1 bash - > ls + docker run --gpus all --rm -it --net=host -v ${PWD}:/workspace -u $(id -u ${USER}) systemsgenetics/kinc:3.4.2-cpu /bin/bash + +Next run the following commands inside of the container: + +.. code:: bash + + cd /workspace/example + ./kinc-gmm-run.sh + +Using the 3D Visualization Tool +``````````````````````````````` +To use the 3D visaulization tool to explore a KINC created network first start an interactive session in the directory where the network file(s) are stored: + +.. code:: bash + + docker run --gpus all --rm -it --net=host -v ${PWD}:/workspace -u $(id -u ${USER}) systemsgenetics/kinc:3.4.2-cpu /bin/bash + +Then run the `kinc-3d-viewer.py`. For example, if the example data is already fully processed using the steps in the previous section you can view the results with the viewer with the following commands. + +.. code:: bash + + cd /workspace/example/results-kinc-gmm-run + kinc-3d-viewer.py \ + --net "PRJNA301554.slim.GEM.log2.paf-th0.00-p1e-3-rsqr0.30-filtered-th_ranked.csGCN.txt" \ + --emx "../data/PRJNA301554.slim.GEM.log2.txt" \ + --amx "../data/PRJNA301554.slim.annotations.txt" + +The first time the viewer is run you will see output about creating 2D and 3D layouts. Once completed you will see a line of text similar to the following: + +.. code:: + + * Running on http://127.0.0.1:8050/ (Press CTRL+C to quit) + +Copy the URL into a web browser and view the network. Automating KINC with Nextflow ----------------------------- diff --git a/example/PRJNA301554.slim.GEM.log2.txt b/example/data/PRJNA301554.slim.GEM.log2.txt similarity index 100% rename from example/PRJNA301554.slim.GEM.log2.txt rename to example/data/PRJNA301554.slim.GEM.log2.txt diff --git a/example/PRJNA301554.slim.annotations.txt b/example/data/PRJNA301554.slim.annotations.txt similarity index 100% rename from example/PRJNA301554.slim.annotations.txt rename to example/data/PRJNA301554.slim.annotations.txt diff --git a/example/kinc-gmm-all-formats.sh b/example/kinc-gmm-all-formats.sh new file mode 100755 index 0000000..a34bbaa --- /dev/null +++ b/example/kinc-gmm-all-formats.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Run this after any KINC run to extract the various formats. + +# Step 5: Extract the network. +echo "Extracting the condition-specific network." + +p="1e-3" +r2="0.30" +th="0.00" + +formats="tidy minimal graphml text" + +for format in $formats; do + + kinc run extract \ + --emx "PRJNA301554.slim.GEM.log2.emx" \ + --ccm "PRJNA301554.slim.GEM.log2.paf.ccm" \ + --cmx "PRJNA301554.slim.GEM.log2.paf.cmx" \ + --csm "PRJNA301554.slim.GEM.log2.paf.csm" \ + --format $format \ + --output "PRJNA301554.slim.GEM.log2.paf-th${th}-p${p}-rsqr${r2}.${format}.txt" \ + --mincorr $th \ + --maxcorr 1 \ + --filter-pvalue $p \ + --filter-rsquare $r2 + +done; diff --git a/example/kinc-gmm-chunkrun.sh b/example/kinc-gmm-chunkrun.sh index 628a5bf..40f5b20 100755 --- a/example/kinc-gmm-chunkrun.sh +++ b/example/kinc-gmm-chunkrun.sh @@ -1,5 +1,7 @@ #!/bin/bash # Perform a GMM run of KINC +mkdir -p results-kinc-gmm-chunkrun +cd results-kinc-gmm-chunkrun # Initialize number of chunks NP=4 @@ -14,7 +16,7 @@ kinc settings set logging off echo "Importing the gene expression matrix (GEM) for the slimmed experiment PRJNA301554" kinc run import-emx \ - --input "PRJNA301554.slim.GEM.log2.txt" \ + --input "../data/PRJNA301554.slim.GEM.log2.txt" \ --output "PRJNA301554.slim.GEM.log2.emx" \ --nan "NA" \ --samples 0 @@ -77,10 +79,12 @@ kinc run cond-test \ --emx "PRJNA301554.slim.GEM.log2.emx" \ --ccm "PRJNA301554.slim.GEM.log2.paf.ccm" \ --cmx "PRJNA301554.slim.GEM.log2.paf.cmx" \ - --amx "PRJNA301554.slim.annotations.txt" \ + --amx "../data/PRJNA301554.slim.annotations.txt" \ --output "PRJNA301554.slim.GEM.log2.paf.csm" \ --feat-tests "Subspecies,Treatment,GTAbbr,Duration" \ - --feat-types "Duration:quantitative" + --feat-types "Duration:quantitative" \ + --alpha 0.001 \ + --power 0.8 # Step 5: Extract the network. echo "Extracting the condition-specific network." @@ -106,7 +110,7 @@ echo "Removing biased edges from the extract network using KINC.R." kinc-filter-bias.R \ --net "PRJNA301554.slim.GEM.log2.paf-th0.00-p1e-3-rsqr0.30.txt" \ - --emx "PRJNA301554.slim.GEM.log2.txt" \ + --emx "../data/PRJNA301554.slim.GEM.log2.txt" \ --out_prefix "PRJNA301554.slim.GEM.log2.paf-th0.00-p1e-3-rsqr0.30" # Step 7: Generate summary plots @@ -122,22 +126,22 @@ echo "Generating condition-specific network files by class and label." kinc-filter-rank.R \ --net "PRJNA301554.slim.GEM.log2.paf-th0.00-p1e-3-rsqr0.30-filtered.GCN.txt" \ --out_prefix "PRJNA301554.slim.GEM.log2.paf-th0.00-p1e-3-rsqr0.30-filtered" \ - --top_n 26035 + --top_n 25000 kinc-filter-rank.R \ --net "PRJNA301554.slim.GEM.log2.paf-th0.00-p1e-3-rsqr0.30-filtered.GCN.txt" \ --out_prefix "PRJNA301554.slim.GEM.log2.paf-th0.00-p1e-3-rsqr0.30-filtered" \ --save_condition_networks \ - --top_n 26035 + --top_n 25000 kinc-filter-rank.R \ --net "PRJNA301554.slim.GEM.log2.paf-th0.00-p1e-3-rsqr0.30-filtered.GCN.txt" \ --out_prefix "PRJNA301554.slim.GEM.log2.paf-th0.00-p1e-3-rsqr0.30-filtered" \ --save_condition_networks --unique_filter "label" \ - --top_n 26035 + --top_n 25000 kinc-filter-rank.R \ --net "PRJNA301554.slim.GEM.log2.paf-th0.00-p1e-3-rsqr0.30-filtered.GCN.txt" \ --out_prefix "PRJNA301554.slim.GEM.log2.paf-th0.00-p1e-3-rsqr0.30-filtered" \ --save_condition_networks --unique_filter "class" \ - --top_n 26035 + --top_n 25000 diff --git a/example/kinc-gmm-mpirun.sh b/example/kinc-gmm-mpirun.sh index c6578f5..cf2ed53 100755 --- a/example/kinc-gmm-mpirun.sh +++ b/example/kinc-gmm-mpirun.sh @@ -1,5 +1,7 @@ #!/bin/bash # Perform a GMM run of KINC +mkdir -p results-kinc-gmm-mpirun +cd results-kinc-gmm-mpirun # Initialize number of MPI ranks NP=4 @@ -14,7 +16,7 @@ kinc settings set logging off echo "Importing the gene expression matrix (GEM) for the slimmed experiment PRJNA301554" kinc run import-emx \ - --input "PRJNA301554.slim.GEM.log2.txt" \ + --input "../data/PRJNA301554.slim.GEM.log2.txt" \ --output "PRJNA301554.slim.GEM.log2.emx" \ --nan "NA" \ --samples 0 @@ -56,10 +58,12 @@ mpiexec -np ${NP} kinc run cond-test \ --emx "PRJNA301554.slim.GEM.log2.emx" \ --ccm "PRJNA301554.slim.GEM.log2.paf.ccm" \ --cmx "PRJNA301554.slim.GEM.log2.paf.cmx" \ - --amx "PRJNA301554.slim.annotations.txt" \ + --amx "../data/PRJNA301554.slim.annotations.txt" \ --output "PRJNA301554.slim.GEM.log2.paf.csm" \ --feat-tests "Subspecies,Treatment,GTAbbr,Duration" \ - --feat-types "Duration:quantitative" + --feat-types "Duration:quantitative" \ + --alpha 0.001 \ + --power 0.8 # Step 5: Extract the network. echo "Extracting the condition-specific network." @@ -85,7 +89,7 @@ echo "Removing biased edges from the extract network using KINC.R." kinc-filter-bias.R \ --net "PRJNA301554.slim.GEM.log2.paf-th0.00-p1e-3-rsqr0.30.txt" \ - --emx "PRJNA301554.slim.GEM.log2.txt" \ + --emx "../data/PRJNA301554.slim.GEM.log2.txt" \ --out_prefix "PRJNA301554.slim.GEM.log2.paf-th0.00-p1e-3-rsqr0.30" # Step 7: Generate summary plots @@ -101,22 +105,22 @@ echo "Generating condition-specific network files by class and label." kinc-filter-rank.R \ --net "PRJNA301554.slim.GEM.log2.paf-th0.00-p1e-3-rsqr0.30-filtered.GCN.txt" \ --out_prefix "PRJNA301554.slim.GEM.log2.paf-th0.00-p1e-3-rsqr0.30-filtered" \ - --top_n 26035 + --top_n 25000 kinc-filter-rank.R \ --net "PRJNA301554.slim.GEM.log2.paf-th0.00-p1e-3-rsqr0.30-filtered.GCN.txt" \ --out_prefix "PRJNA301554.slim.GEM.log2.paf-th0.00-p1e-3-rsqr0.30-filtered" \ --save_condition_networks \ - --top_n 26035 + --top_n 25000 kinc-filter-rank.R \ --net "PRJNA301554.slim.GEM.log2.paf-th0.00-p1e-3-rsqr0.30-filtered.GCN.txt" \ --out_prefix "PRJNA301554.slim.GEM.log2.paf-th0.00-p1e-3-rsqr0.30-filtered" \ --save_condition_networks --unique_filter "label" \ - --top_n 26035 + --top_n 25000 kinc-filter-rank.R \ --net "PRJNA301554.slim.GEM.log2.paf-th0.00-p1e-3-rsqr0.30-filtered.GCN.txt" \ --out_prefix "PRJNA301554.slim.GEM.log2.paf-th0.00-p1e-3-rsqr0.30-filtered" \ --save_condition_networks --unique_filter "class" \ - --top_n 26035 + --top_n 25000 diff --git a/example/kinc-gmm-run.sh b/example/kinc-gmm-run.sh index f90de9c..b8ffd88 100755 --- a/example/kinc-gmm-run.sh +++ b/example/kinc-gmm-run.sh @@ -1,5 +1,7 @@ #!/bin/bash # Perform a GMM run of KINC +mkdir -p results-kinc-gmm-run +cd results-kinc-gmm-run # Initialize KINC settings kinc settings set cuda 0 @@ -11,7 +13,7 @@ kinc settings set logging off echo "Importing the gene expression matrix (GEM) for the slimmed experiment PRJNA301554" kinc run import-emx \ - --input "PRJNA301554.slim.GEM.log2.txt" \ + --input "../data/PRJNA301554.slim.GEM.log2.txt" \ --output "PRJNA301554.slim.GEM.log2.emx" \ --nan "NA" \ --samples 0 @@ -33,7 +35,7 @@ kinc run similarity \ --preout TRUE \ --postout TRUE \ --mincorr 0 \ - --maxcorr 1 + --maxcorr 1 # Step 3: Filter for clusters with low power. echo "Filtering edges with insufficient statistical power." @@ -53,10 +55,12 @@ kinc run cond-test \ --emx "PRJNA301554.slim.GEM.log2.emx" \ --ccm "PRJNA301554.slim.GEM.log2.paf.ccm" \ --cmx "PRJNA301554.slim.GEM.log2.paf.cmx" \ - --amx "PRJNA301554.slim.annotations.txt" \ + --amx "../data/PRJNA301554.slim.annotations.txt" \ --output "PRJNA301554.slim.GEM.log2.paf.csm" \ --feat-tests "Subspecies,Treatment,GTAbbr,Duration" \ - --feat-types "Duration:quantitative" + --feat-types "Duration:quantitative" \ + --alpha 0.001 \ + --power 0.8 # Step 5: Extract the network. echo "Extracting the condition-specific network." @@ -82,7 +86,7 @@ echo "Removing biased edges from the extract network using KINC.R." kinc-filter-bias.R \ --net "PRJNA301554.slim.GEM.log2.paf-th0.00-p1e-3-rsqr0.30.txt" \ - --emx "PRJNA301554.slim.GEM.log2.txt" \ + --emx "../data/PRJNA301554.slim.GEM.log2.txt" \ --out_prefix "PRJNA301554.slim.GEM.log2.paf-th0.00-p1e-3-rsqr0.30" # Step 7: Generate summary plots @@ -98,22 +102,22 @@ echo "Generating condition-specific network files by class and label." kinc-filter-rank.R \ --net "PRJNA301554.slim.GEM.log2.paf-th0.00-p1e-3-rsqr0.30-filtered.GCN.txt" \ --out_prefix "PRJNA301554.slim.GEM.log2.paf-th0.00-p1e-3-rsqr0.30-filtered" \ - --top_n 26035 + --top_n 25000 kinc-filter-rank.R \ --net "PRJNA301554.slim.GEM.log2.paf-th0.00-p1e-3-rsqr0.30-filtered.GCN.txt" \ --out_prefix "PRJNA301554.slim.GEM.log2.paf-th0.00-p1e-3-rsqr0.30-filtered" \ --save_condition_networks \ - --top_n 26035 + --top_n 25000 kinc-filter-rank.R \ --net "PRJNA301554.slim.GEM.log2.paf-th0.00-p1e-3-rsqr0.30-filtered.GCN.txt" \ --out_prefix "PRJNA301554.slim.GEM.log2.paf-th0.00-p1e-3-rsqr0.30-filtered" \ --save_condition_networks --unique_filter "label" \ - --top_n 26035 + --top_n 25000 kinc-filter-rank.R \ --net "PRJNA301554.slim.GEM.log2.paf-th0.00-p1e-3-rsqr0.30-filtered.GCN.txt" \ --out_prefix "PRJNA301554.slim.GEM.log2.paf-th0.00-p1e-3-rsqr0.30-filtered" \ --save_condition_networks --unique_filter "class" \ - --top_n 26035 + --top_n 25000 diff --git a/example/kinc-gmm-view-duration.sh b/example/kinc-gmm-view-duration.sh index 0f4989c..0c4729b 100755 --- a/example/kinc-gmm-view-duration.sh +++ b/example/kinc-gmm-view-duration.sh @@ -3,5 +3,5 @@ kinc-3d-viewer.py \ --net "PRJNA301554.slim.GEM.log2.paf-th0.00-p1e-3-rsqr0.30-filtered-th_ranked.Duration-unique_class.csGCN.txt" \ - --emx "PRJNA301554.slim.GEM.log2.txt" \ - --amx "PRJNA301554.slim.annotations.txt" + --emx "../data/PRJNA301554.slim.GEM.log2.txt" \ + --amx "../data/PRJNA301554.slim.annotations.txt" diff --git a/example/kinc-gmm-view-full.sh b/example/kinc-gmm-view-full.sh index e664518..872c107 100755 --- a/example/kinc-gmm-view-full.sh +++ b/example/kinc-gmm-view-full.sh @@ -3,5 +3,5 @@ kinc-3d-viewer.py \ --net "PRJNA301554.slim.GEM.log2.paf-th0.00-p1e-3-rsqr0.30-filtered-th_ranked.csGCN.txt" \ - --emx "PRJNA301554.slim.GEM.log2.txt" \ - --amx "PRJNA301554.slim.annotations.txt" + --emx "../data/PRJNA301554.slim.GEM.log2.txt" \ + --amx "../data/PRJNA301554.slim.annotations.txt" diff --git a/example/kinc-gmm-view-treatment.sh b/example/kinc-gmm-view-treatment.sh index 5462a94..5721d3b 100755 --- a/example/kinc-gmm-view-treatment.sh +++ b/example/kinc-gmm-view-treatment.sh @@ -3,5 +3,5 @@ kinc-3d-viewer.py \ --net "PRJNA301554.slim.GEM.log2.paf-th0.00-p1e-3-rsqr0.30-filtered-th_ranked.Treatment-unique_class.csGCN.txt" \ - --emx "PRJNA301554.slim.GEM.log2.txt" \ - --amx "PRJNA301554.slim.annotations.txt" + --emx "../data/PRJNA301554.slim.GEM.log2.txt" \ + --amx "../data/PRJNA301554.slim.annotations.txt" diff --git a/example/kinc-traditional-run.sh b/example/kinc-traditional-run.sh index c3c465a..1bfe9ef 100755 --- a/example/kinc-traditional-run.sh +++ b/example/kinc-traditional-run.sh @@ -1,11 +1,13 @@ #!/bin/bash # Perform a traditional run of KINC +mkdir -p results-kinc-traditional +cd results-kinc-traditional # Step 1: Import the similarity matrix echo "Importing the gene expression matrix (GEM) for the slimmed experiment PRJNA301554" kinc run import-emx \ - --input "PRJNA301554.slim.GEM.log2.txt" \ + --input "../data/PRJNA301554.slim.GEM.log2.txt" \ --output "PRJNA301554.slim.GEM.log2.emx" \ --nan "NA" \ --samples 0 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..5340116 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,11 @@ +argparse +matplotlib +numpy +pandas +scipy +seaborn +python-igraph +plotly +fa2 +dash +progress diff --git a/scripts/kinc-docker.sh b/scripts/kinc-docker.sh new file mode 100755 index 0000000..67606ff --- /dev/null +++ b/scripts/kinc-docker.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# example: ./kinc-docker.sh cuda 4 Yeast.emx.txt + +MODE="$1" +NP="$2" +INFILE="$3" + +nvidia-docker run \ + --rm -it \ + -v ${PWD}:/workspace \ + systemsgenetics/kinc:latest-gpu \ + bash -c "./kinc.sh ${MODE} ${NP} ${INFILE}; chown $(stat -c '%u:%g' .) *" diff --git a/src/KINC.pri b/src/KINC.pri index 59dbd24..97fd7a4 100644 --- a/src/KINC.pri +++ b/src/KINC.pri @@ -2,7 +2,7 @@ # Versions KINC_MAJOR_VERSION = 3 KINC_MINOR_VERSION = 4 -KINC_REVISION = 1 +KINC_REVISION = 2 VERSION = $${KINC_MAJOR_VERSION}.$${KINC_MINOR_VERSION}.$${KINC_REVISION} diff --git a/src/core/conditionaltest.cpp b/src/core/conditionaltest.cpp index 2084591..3a9c35d 100644 --- a/src/core/conditionaltest.cpp +++ b/src/core/conditionaltest.cpp @@ -47,10 +47,16 @@ std::unique_ptr ConditionalTest::makeWork(int index) con // initialize pairwise iterator for cmx file CorrelationMatrix::Pair cmxPair(_cmx); + if (index == 0) + { + cmxPair.readFirst(); + } + else + { + cmxPair.read(Pairwise::Index(start)); + } - // iterate to the start index of the next work block - cmxPair.read(Pairwise::Index(start)); - + // Iterate to the start index of the next work block for ( qint64 i = 0; i < size; i++ ) { cmxPair.readNext(); @@ -130,10 +136,8 @@ void ConditionalTest::process(const EAbstractAnalyticBlock* result) } // write the info into the CSM - if ( csmPair.clusterSize() > 0 ) - { - csmPair.write(pair.index); - } + csmPair.write(pair.index); + } } diff --git a/src/core/conditionaltest.h b/src/core/conditionaltest.h index 4a2061c..7e4f9f8 100644 --- a/src/core/conditionaltest.h +++ b/src/core/conditionaltest.h @@ -139,6 +139,15 @@ class ConditionalTest : public EAbstractAnalytic * Cluster information */ QVector> _clusters; + + /*! + * The significance level (i.e. Type I error rate, alpha) for the regression power test. + */ + double _powerThresholdAlpha {0.001}; + /*! + * The power value (i.e. 1 minus Type II error rate, 1 minus beta) for the regression power test. + */ + double _powerThresholdPower {0.8}; }; diff --git a/src/core/conditionaltest_input.cpp b/src/core/conditionaltest_input.cpp index f9abf98..aac73a8 100644 --- a/src/core/conditionaltest_input.cpp +++ b/src/core/conditionaltest_input.cpp @@ -49,9 +49,12 @@ EAbstractAnalyticInput::Type ConditionalTest::Input::type(int index) const case AMXINPUT: return FileIn; case Delimiter: return String; case MISSING: return String; + case NANToken: return String; case CSMOUT: return DataOut; case OVERRIDES: return String; case TEST: return String; + case PowerThresholdAlpha: return Type::Double; + case PowerThresholdPower: return Type::Double; default: return Boolean; } } @@ -123,10 +126,19 @@ QVariant ConditionalTest::Input::data(int index, Role role) const { case CommandLineName: return QString("missing"); case Title: return tr("Missing value:"); - case WhatsThis: return tr("The string that specifies the missing value in the annotation matrix (e.g. NA, 0, 0.0)."); + case WhatsThis: return tr("Deprecated. This option will work but may be removed in future versions. Please use --nan instead."); case Default: return tr("NA"); default: return QVariant(); } + case NANToken: + switch (role) + { + case Role::CommandLineName: return QString("nan"); + case Role::Title: return tr("NAN Token:"); + case Role::WhatsThis: return tr("The string that specifies the missing value in the annotation matrix (e.g. NA, 0, 0.0)."); + case Role::Default: return "NA"; + default: return QVariant(); + } case CSMOUT: EDEBUG_FUNC(this, role); switch(role) @@ -153,6 +165,28 @@ QVariant ConditionalTest::Input::data(int index, Role role) const case WhatsThis: return tr("A comma-separated list of features, with no spaces around commas, from column names of the annotation matrix that should be tested. For example, if the annotation matrix has columns 'Treatment' and 'Subspecies' you can enter: \"Treatment,Subspecies\" Note: column names are case-sensitive."); default: return QVariant(); } + case PowerThresholdAlpha: + switch (role) + { + case Role::CommandLineName: return QString("alpha"); + case Role::Title: return tr("Regression Signficance Level"); + case Role::WhatsThis: return tr("The significance level (i.e. Type I error rate, alpha) for the regression power test. This argument only applies to quantitative variables."); + case Role::Default: return 0.001; + case Role::Minimum: return -std::numeric_limits::infinity(); + case Role::Maximum: return +std::numeric_limits::infinity(); + default: return QVariant(); + } + case PowerThresholdPower: + switch (role) + { + case Role::CommandLineName: return QString("power"); + case Role::Title: return tr("Regression Power of test"); + case Role::WhatsThis: return tr("The power value (i.e. 1 minus Type II error rate, 1 minus beta) for the regression power test. For example, if the desired Type II error rate is 0.2, then this value should be 0.8. . This argument only applies to quantitative variables."); + case Role::Default: return 0.8; + case Role::Minimum: return -std::numeric_limits::infinity(); + case Role::Maximum: return +std::numeric_limits::infinity(); + default: return QVariant(); + } default: return QVariant(); } } @@ -175,6 +209,9 @@ void ConditionalTest::Input::set(int index, const QVariant& value) case Delimiter: _base->_delimiter = value.toString(); break; + case NANToken: + _base->_missing = value.toString(); + break; case MISSING: _base->_missing = value.toString(); break; @@ -184,6 +221,12 @@ void ConditionalTest::Input::set(int index, const QVariant& value) case OVERRIDES: _base->_userTestTypesStr = value.toString(); break; + case PowerThresholdAlpha: + _base->_powerThresholdAlpha = value.toDouble(); + break; + case PowerThresholdPower: + _base->_powerThresholdPower = value.toDouble(); + break; } } diff --git a/src/core/conditionaltest_input.h b/src/core/conditionaltest_input.h index 41f3495..8523b47 100644 --- a/src/core/conditionaltest_input.h +++ b/src/core/conditionaltest_input.h @@ -19,10 +19,13 @@ class ConditionalTest::Input : public EAbstractAnalyticInput ,CMXINPUT ,AMXINPUT ,Delimiter - ,MISSING + ,MISSING /** deprecated in favor of NANToken**/ + ,NANToken ,CSMOUT ,TEST ,OVERRIDES + ,PowerThresholdAlpha + ,PowerThresholdPower ,Total }; public: diff --git a/src/core/conditionaltest_serial.cpp b/src/core/conditionaltest_serial.cpp index c2d5325..ac16488 100644 --- a/src/core/conditionaltest_serial.cpp +++ b/src/core/conditionaltest_serial.cpp @@ -12,7 +12,9 @@ #include #include #include - +#include +#include +#include /*! @@ -55,10 +57,15 @@ std::unique_ptr ConditionalTest::Serial::execute(const E // iterate through each pair in the work block for ( qint64 wbIndex = 0; wbIndex < workBlock->size(); ++wbIndex ) { + // print warning if iterator indices do not match if ( ccmPair.index() != cmxPair.index() ) { - qInfo() << "warning: ccm and cmx files are out of sync"; + QString source = _base->_cmx->geneNames().at(cmxPair.index().getX()).toString(); + QString target = _base->_cmx->geneNames().at(cmxPair.index().getY()).toString(); + qInfo() << "warning: ccm and cmx files are out of sync at cmx coordinate:" + << source << "," << target << " (" + << cmxPair.index().getX() << "," << cmxPair.index().getY() <<")."; } // initialize set of p values and r2 values for each cluster @@ -92,9 +99,9 @@ std::unique_ptr ConditionalTest::Serial::execute(const E // For linear regresssion we need a variable that will hold the // pvalue and the r2 value. QVector results(2); - regression(amx_column, ccmPair, clusterIndex, featureIndex, results); + regression(amx_column, ccmPair, clusterIndex, results); pValues[clusterIndex][test_index] = results.at(0); - r2[clusterIndex][test_index] = results.at(1); + r2[clusterIndex][test_index] = results.at(1); test_index++; } else if ( _base->_testType.at(featureIndex) == CATEGORICAL ) @@ -102,9 +109,10 @@ std::unique_ptr ConditionalTest::Serial::execute(const E // Loop through each label (category) of the feature. for ( qint32 labelIndex = 1; labelIndex < _base->_features.at(featureIndex).size(); labelIndex++ ) { - double results; - hypergeom(amx_column, ccmPair, clusterIndex, featureIndex, labelIndex, results); - pValues[clusterIndex][test_index] = results; + double pval; + //hypergeom(amx_column, ccmPair, clusterIndex, featureIndex, labelIndex, pval); + test_proportions(amx_column, ccmPair, clusterIndex, featureIndex, labelIndex, pval); + pValues[clusterIndex][test_index] = pval; r2[clusterIndex][test_index] = qQNaN(); test_index++; } @@ -114,14 +122,14 @@ std::unique_ptr ConditionalTest::Serial::execute(const E // append pair to result block resultBlock->append(Pair { - ccmPair.index(), + cmxPair.index(), pValues, r2 }); // read the next pair - ccmPair.readNext(); cmxPair.readNext(); + ccmPair.read(cmxPair.index()); } return std::unique_ptr(resultBlock); @@ -156,7 +164,7 @@ bool ConditionalTest::Serial::isEmpty(QVector>& matrix) /*! - * Run the first hypergeometric + * The hypergeometric test * */ void ConditionalTest::Serial::hypergeom( @@ -178,7 +186,7 @@ void ConditionalTest::Serial::hypergeom( // annotation data. int num_samples = _base->_data.at(featureIndex).size(); int cluster_size = 0; - int label_count = 0; + int total_label_count = 0; int labels_in_cluster = 0; for ( int j = 0; j < num_samples; j++ ) { @@ -186,7 +194,7 @@ void ConditionalTest::Serial::hypergeom( // if data is the same as the test label add one to the catagory counter if ( test_type == _base->CATEGORICAL && amx_column[j] == test_label ) { - label_count++; + total_label_count++; } if ( ccmPair.at(clusterIndex, j) == 1 ) { @@ -198,6 +206,14 @@ void ConditionalTest::Serial::hypergeom( } } + // If there are no matching labels in this cluster then return. This + // could happen if the annotation matrix has all NAs for the cluster. + if ( labels_in_cluster == 0 ) + { + result = qQNaN(); + return; + } + // We use the hypergeometric distribution because the samples are // selected from the population for membership in the cluster without @@ -210,9 +226,9 @@ void ConditionalTest::Serial::hypergeom( int sampleSize = _base->_emx->sampleSize(); // Population contains n1 elements of Type 1. - int n1 = label_count; + int n1 = total_label_count; // Population contains n2 elements of Type 2. - int n2 = sampleSize - label_count; + int n2 = sampleSize - total_label_count; // k elements of Type 1 were selected. int k = labels_in_cluster; // t total elements were selected. @@ -289,7 +305,7 @@ void ConditionalTest::Serial::hypergeom( jkap = 0; } - // Now reset the sample size and the proporiton of success. + // The gsl_cdf_hypergeometric_Q function uses the upper-tail of the CDF. result = gsl_cdf_hypergeometric_Q(jkap, n1, n2, bs_t); return; } @@ -307,7 +323,159 @@ void ConditionalTest::Serial::hypergeom( return; } +/*! + * The z test of proprotions + * + */ +void ConditionalTest::Serial::test_proportions ( + const QVector& amx_column, + const CCMatrix::Pair& ccmPair, + int clusterIndex, + int featureIndex, + int labelIndex, + double &result) +{ + EDEBUG_FUNC(this); + + // Get the test name and the test type. + QString test_label = _base->_features.at(featureIndex).at(labelIndex); + TestType test_type = _base->_testType.at(featureIndex); + + /* For debugging purposes + QString source_name = _base->_cmx->geneNames().at(ccmPair.index().getX()).toString(); + QString target_name = _base->_cmx->geneNames().at(ccmPair.index().getY()).toString(); + + if (source_name == "pycom14g13710" && target_name == "pycom02g20220") { + std::cout << "Found it"; + }*/ + + // We need to get the data for the feature being tested and + // count how many entries there are for the label in the + // annotation data. + int num_samples = _base->_data.at(featureIndex).size(); + int cluster_size = 0; + int total_label_count = 0; + int total_non_missing = 0; + int labels_in_cluster = 0; + int labels_out_cluster = 0; + int jx = 0; + for ( int j = 0; j < num_samples; j++ ) + { + + // if data is the same as the test label add one to the catagory counter + if ( test_type == _base->CATEGORICAL && amx_column[j] == test_label ) + { + total_label_count++; + } + if ( ccmPair.at(clusterIndex, j) == 1 ) + { + cluster_size++; + if ( test_type == _base->CATEGORICAL && amx_column.at(j) == test_label ) + { + labels_in_cluster++; + } + } + if ( ccmPair.at(clusterIndex, j) != 1 && ccmPair.at(clusterIndex, j) != 9) + { + if ( test_type == _base->CATEGORICAL && amx_column.at(j) == test_label ) + { + labels_out_cluster++; + } + } + if ( ccmPair.at(clusterIndex, j) != 9 ) + { + total_non_missing++; + } + } + + // If there are no matching labels in this cluster then return. This + // could happen if the annotation matrix has all NAs for the cluster. + if ( labels_in_cluster == 0 ) + { + result = qQNaN(); + return; + } + + int sampleSize = _base->_emx->sampleSize(); + + + // Before bootstraping get the list of indexes that belong to this cluster. + // We will randomly select from these indexes. + int cluster_indexes[cluster_size]; + for ( int j = 0; j < sampleSize; j++ ) + { + if ( ccmPair.at(clusterIndex, j) == 1 ) + { + cluster_indexes[jx++] = j; + } + } + + // Initialize the uniform random number generator. + const gsl_rng_type * T; + gsl_rng * r; + gsl_rng_env_setup(); + T = gsl_rng_default; + r = gsl_rng_alloc(T); + + // Perform 30 iterations of the boostrap sampling. + int num_i = 30; + // Select 30 items for each iteration + int bs_t = 30; + // Holds the bootstrap average proportion. + double avg_prop = 0.0; + for ( int i = 0; i < num_i; i++ ) + { + // Keeps track of the number of successes for each iteration. + int ns = 0; + + // The gsl_ran_sample function randomly chooses samples with replacement from a list. + int chosen[bs_t]; + gsl_ran_sample(r, chosen, bs_t, cluster_indexes, cluster_size, sizeof(int)); + // Now count the number of randomly selected items from the cluster. + for ( int j = 0; j < bs_t; j++ ) + { + QString slabel = amx_column.at(chosen[j]); + if ( ccmPair.at(clusterIndex, chosen[j]) == 1 && amx_column.at(chosen[j]) == test_label ) + { + ns = ns + 1; + } + } + + avg_prop += static_cast(ns) / static_cast(bs_t); + } + + // Calculate the average proportion from all iterations + // and free the random number struct. + avg_prop = avg_prop / num_i; + gsl_rng_free(r); + + // First test. + // The null hypothesis for this test is that the cluster is + // comprised of equal or less of the desired label. This would + // happen if all labels have equal chance, or there is a bias + // against the label. + // Ho: avg_prop <= 0.5. The value 0.5 is chosen with the asumption + // that there is equal or less proportions of the desired label. + // Ha: avg_prop > 0.5 + double exp_prop = 0.5; + double z = (avg_prop - exp_prop) / sqrt((exp_prop * (1 - exp_prop)) / bs_t); + double pval1 = 1 - gsl_cdf_gaussian_P(z, 1); + + // Second, test. + // The null hypothesis is that the proportion of labels in the cluster + // is equal or less than the proportion out of the cluster. This would + // happen if all the labels have equal chance of being in the cluster or + // if there is bias against the label. + double obs_prop = (avg_prop * cluster_size) / static_cast(total_label_count); + obs_prop = std::min(1.0, obs_prop); + exp_prop = static_cast(total_label_count) / static_cast(num_samples); + z = (obs_prop - exp_prop) / sqrt((exp_prop * (1 - exp_prop)) / bs_t); + double pval2 = 1 - gsl_cdf_gaussian_P(z, 1); + + // Return the maximum p-value + result = std::max(pval1, pval2); +} /*! * Performs the regression test for quantitative data. @@ -316,7 +484,6 @@ void ConditionalTest::Serial::regression( const QVector& amx_column, const CCMatrix::Pair& ccmPair, int clusterIndex, - int featureIndex, QVector& results) { EDEBUG_FUNC(this, &amxInfo, &ccmPair, clusterIndex); @@ -345,6 +512,18 @@ void ConditionalTest::Serial::regression( } } + // Don't do regression analysis if there are fewer than 5 samples. + // This can occur if the annotation matrix has missing values + // for samples in the cluster. The Degrees of Freedom for the error + // is test_cluster_size - 4 so we need at least 5 samples. Odds + // are these tests will get filterd out anyway for low power. + if ( test_cluster_size < 5 ) + { + results[0] = qQNaN(); + results[1] = qQNaN(); + return; + } + // Allocate a matrix to hold the predictior variables, in this case the gene // Expression data. X = gsl_matrix_alloc(test_cluster_size, 4); @@ -363,6 +542,7 @@ void ConditionalTest::Serial::regression( geneX.read(ccmPair.index().getX()); geneY.read(ccmPair.index().getY()); + // Useful for debugging purposes. // QString g1Name = geneX.toString(); // QString g2Name = geneY.toString(); @@ -460,11 +640,37 @@ void ConditionalTest::Serial::regression( // Set the results array if ( qIsNaN(pValue) ) { - results[0] = 1; - results[1] = 0; + results[0] = qQNaN(); + results[1] = qQNaN(); } - else { - results[0] = pValue; - results[1] = R2; + else { + double power = pwr_f2_test(3, test_cluster_size - 4, R2, _base->_powerThresholdAlpha); + if ( power >= _base->_powerThresholdPower ) { + results[0] = pValue; + results[1] = R2; + } + else { + results[0] = std::numeric_limits::quiet_NaN(); + results[1] = std::numeric_limits::quiet_NaN(); + } } } + +/** + * @brief ConditionalTest::Serial::pwr_f2_test + * @param df1 degrees of freedom of the numerator: the number of coefficients in + * the model minus the intercept. + * @param df2 degrees of freedom of the denominator: the number of error degrees + * of freedom which is n - df1 - 1 + * @param f2 effect size + * @param sig_level signficance level + * @return + */ +double ConditionalTest::Serial::pwr_f2_test(int df1, int df2, double f2, double sig_level) +{ + double lambda = f2 * (df1 + df2 + 1); + double p = gsl_cdf_fdist_Qinv(sig_level, df1, df2); + boost::math::non_central_f_distribution<> ncfd(df1, df2, lambda); + double power = 1 - boost::math::cdf(ncfd, p); + return power; +} diff --git a/src/core/conditionaltest_serial.h b/src/core/conditionaltest_serial.h index 3334041..a710773 100644 --- a/src/core/conditionaltest_serial.h +++ b/src/core/conditionaltest_serial.h @@ -27,12 +27,19 @@ class ConditionalTest::Serial : public EAbstractAnalyticSerial int featureIndex, int labelIndex, double& results); - - void regression( + + void test_proportions( const QVector& amx_column, const CCMatrix::Pair& ccmPair, int clusterIndex, int featureIndex, + int labelIndex, + double& results); + + void regression( + const QVector& amx_column, + const CCMatrix::Pair& ccmPair, + int clusterIndex, QVector& results); private: @@ -40,6 +47,9 @@ class ConditionalTest::Serial : public EAbstractAnalyticSerial * Pointer to the base analytic for this object. */ ConditionalTest* _base; + + // Performs power analysis for multiple-linear regression. + double pwr_f2_test(int u, int v, double f2, double sig_level); }; diff --git a/src/core/corrpower.cpp b/src/core/corrpower.cpp index b7681d7..a32c7be 100644 --- a/src/core/corrpower.cpp +++ b/src/core/corrpower.cpp @@ -47,10 +47,16 @@ std::unique_ptr CorrPowerFilter::makeWork(int index) con // initialize pairwise iterator for cmx file CorrelationMatrix::Pair cmxPair(_cmx); + if (index == 0) + { + cmxPair.readFirst(); + } + else + { + cmxPair.read(Pairwise::Index(start)); + } - // iterate to the start index of the next work block - cmxPair.read(Pairwise::Index(start)); - + // Iterate to the start index of the next work block for ( qint64 i = 0; i < size; i++ ) { cmxPair.readNext(); @@ -59,6 +65,7 @@ std::unique_ptr CorrPowerFilter::makeWork(int index) con // save start index of next work block _workBlockStart = cmxPair.index().toRawIndex(); + // construct work block return std::unique_ptr(new WorkBlock(index, start, size)); } diff --git a/src/core/corrpower_serial.cpp b/src/core/corrpower_serial.cpp index e65bbb2..fbd61f6 100644 --- a/src/core/corrpower_serial.cpp +++ b/src/core/corrpower_serial.cpp @@ -59,10 +59,15 @@ std::unique_ptr CorrPowerFilter::Serial::execute(const E // Iterate through each pair in the work block. for ( qint64 wbIndex = 0; wbIndex < workBlock->size(); ++wbIndex ) { + // Print warning if iterator indices do not match if ( ccmPair.index() != cmxPair.index() ) { - qInfo() << "warning: ccm and cmx files are out of sync"; + QString source = _base->_cmx->geneNames().at(cmxPair.index().getX()).toString(); + QString target = _base->_cmx->geneNames().at(cmxPair.index().getY()).toString(); + qInfo() << "warning: ccm and cmx files are out of sync at cmx coordinate:" + << source << "," << target << " (" + << cmxPair.index().getX() << "," << cmxPair.index().getY() <<")."; } // Get the number of samples and clusters. @@ -128,7 +133,7 @@ std::unique_ptr CorrPowerFilter::Serial::execute(const E if ( new_correlations.size() > 0 ) { resultBlock->append(Pair { - ccmPair.index(), + cmxPair.index(), new_labels, new_correlations, k_keep @@ -136,8 +141,9 @@ std::unique_ptr CorrPowerFilter::Serial::execute(const E } // read the next pair - ccmPair.readNext(); cmxPair.readNext(); + ccmPair.read(cmxPair.index()); + } // We're done! Return the result block. diff --git a/src/core/extract_input.cpp b/src/core/extract_input.cpp index fe03144..8ecaac7 100644 --- a/src/core/extract_input.cpp +++ b/src/core/extract_input.cpp @@ -11,9 +11,10 @@ const QStringList Extract::Input::FORMAT_NAMES { "text" + ,"tidy" ,"minimal" ,"graphml" - ,"tidy" + }; diff --git a/src/core/extract_networkwriter.cpp b/src/core/extract_networkwriter.cpp index f047096..a8f5293 100644 --- a/src/core/extract_networkwriter.cpp +++ b/src/core/extract_networkwriter.cpp @@ -44,28 +44,34 @@ int Extract::NetworkWriter::readNext() { EDEBUG_FUNC(this,cmx_index); - // read the next pair in the cmx + // Read the next pair in the cmx. _cmxPair.readNext(); // read the next pair in the ccm (should always match cmx) if ( _ccm ) { - _ccmPair.readNext(); + _ccmPair.read(_cmxPair.index()); if ( _cmxPair.index() != _ccmPair.index() ) { - qInfo() << "warning: cmx and ccm are out of sync"; + qInfo() << "warning: cmx and ccm are out of sync at cmx coordinate:" + << getEdgeGene1() << "," << getEdgeGene2() << " (" + << _cmxPair.index().getX() << "," << _cmxPair.index().getY() <<")."; + readNext(); } } // read the next pair in the csm (should always match cmx) if ( _csm ) { - _csmPair.readNext(); + _csmPair.read(_cmxPair.index()); if ( _cmxPair.index() != _csmPair.index() ) { - qInfo() << "warning: cmx and ccm are out of sync"; + qInfo() << "warning: cmx and csm are out of sync at cmx coordinate:" + << getEdgeGene1() << "," << getEdgeGene2() << " (" + << _cmxPair.index().getX() << "," << _cmxPair.index().getY() <<") "; + readNext(); } } diff --git a/src/core/importexpressionmatrix_input.cpp b/src/core/importexpressionmatrix_input.cpp index e600b03..18023c0 100644 --- a/src/core/importexpressionmatrix_input.cpp +++ b/src/core/importexpressionmatrix_input.cpp @@ -86,7 +86,7 @@ QVariant ImportExpressionMatrix::Input::data(int index, Role role) const { case Role::CommandLineName: return QString("nan"); case Role::Title: return tr("NAN Token:"); - case Role::WhatsThis: return tr("Expected token for expressions that have no value."); + case Role::WhatsThis: return tr("The string that specifies the missing value in the annotation matrix (e.g. NA, 0, 0.0)."); case Role::Default: return "NA"; default: return QVariant(); } diff --git a/src/core/pairwise_matrix_pair.cpp b/src/core/pairwise_matrix_pair.cpp index c359eb2..b8ddf04 100644 --- a/src/core/pairwise_matrix_pair.cpp +++ b/src/core/pairwise_matrix_pair.cpp @@ -44,8 +44,10 @@ void Matrix::Pair::write(const Index& index) * Read the pair with the given pairwise index from the data object file. * * @param index + * + * @return the index of the pair. If no pair is found at this index then -1 is returned. */ -void Matrix::Pair::read(const Index& index) const +qint64 Matrix::Pair::read(const Index& index) const { EDEBUG_FUNC(this,&index); @@ -61,10 +63,30 @@ void Matrix::Pair::read(const Index& index) const _rawIndex = clusterIndex; readNext(); } + return clusterIndex; + } +/*! + * For sparse matricies the index of the first pair may not be know. + * + * This function is used to find that first pair and read it. + */ +void Matrix::Pair::readFirst() const +{ + EDEBUG_FUNC(this); + + for (qint64 i = 0; i < _cMatrix->size(); i++) + { + if (this->read(Pairwise::Index(i)) != -1) { + return; + } + } +} + + /*! * Read the next pair in the data object file. */ diff --git a/src/core/pairwise_matrix_pair.h b/src/core/pairwise_matrix_pair.h index af8cda6..9300a0a 100644 --- a/src/core/pairwise_matrix_pair.h +++ b/src/core/pairwise_matrix_pair.h @@ -32,8 +32,9 @@ namespace Pairwise virtual int clusterSize() const = 0; virtual bool isEmpty() const = 0; void write(const Index& index); - void read(const Index& index) const; + qint64 read(const Index& index) const; void reset() const { _rawIndex = 0; } + void readFirst() const; void readNext() const; bool hasNext() const { return _rawIndex != _cMatrix->_clusterSize; } const Index& index() const { return _index; }