# Exercise
#
# 1. Load the data in the file "sample_data.tsv" as a numpy array

import numpy as np
import scipy.stats as stats

# Single definition of the input path, so the numeric load and the header read
# below cannot drift apart.
DATA_PATH = "../exercises/sample_data.tsv"

# The first row of the file holds the column names, so it is skipped here and
# parsed separately below.
# The builtin `float` is used as dtype: the `np.float` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
data = np.loadtxt(DATA_PATH, dtype=float, delimiter='\t', skiprows=1)

# Only the first line is needed for the header; the context manager closes the
# file even if reading fails.
with open(DATA_PATH, 'r') as header_file:
    header = header_file.readline().strip().split('\t')

# Columns are later selected by their position in `header`, so a width mismatch
# would silently pick the wrong columns; fail early instead.
assert data.ndim == 2 and data.shape[1] == len(header), (
    "header has %d names but data has shape %s" % (len(header), data.shape)
)

print(data.shape)
print(header)
# showing the first 5 rows
print(data[0:5,])

# 2. Log-transform the data

# Base-2 logarithm of every entry.
log_data = np.log2(data)

print(log_data[0:5,])
# 3. Find the row-wise means for replicates of Sample1 and Sample2

print(header)

# Positions of the columns belonging to each sample, matched on the header prefix.
sample1_columns = [idx for idx, name in enumerate(header) if name.startswith('Sample1')]
sample2_columns = [idx for idx, name in enumerate(header) if name.startswith('Sample2')]

# Show the first two rows of each group to check the column selection.
print('sample 1')
print(log_data[:2, sample1_columns])
print('sample 2')
print(log_data[:2, sample2_columns])

# Averaging along axis=1 collapses the selected columns, giving one mean per row.
meanSample1 = log_data[:, sample1_columns].mean(axis=1)
meanSample2 = log_data[:, sample2_columns].mean(axis=1)
# 4. Find the row-wise standard deviations the same way as means

# Columns of each sample, extracted once and reused for both statistics below.
sample1_values = log_data[:, sample1_columns]
sample2_values = log_data[:, sample2_columns]

# One standard deviation per row (axis=1).
# NOTE(review): ddof is left at NumPy's default of 0; pass ddof=1 if the
# sample (n-1) standard deviation is what is wanted here -- confirm.
stdSample1 = sample1_values.std(axis=1)
stdSample2 = sample2_values.std(axis=1)

# 5. Use the function scipy.stats.ttest_ind to calculate a p-value for every row

# axis=1 runs one test per row; equal_var=False requests Welch's t-test, which
# does not assume equal variances in the two groups.
welch_result = stats.ttest_ind(sample1_values, sample2_values, axis=1, equal_var=False)
TTest_pValues = welch_result.pvalue
# 6. Select p-values which are smaller than 10^-2

# Significance threshold applied to the per-row p-values.
P_VALUE_CUTOFF = 10**-2

# Boolean mask with one entry per row: True where the p-value is below the cutoff.
significant = TTest_pValues < P_VALUE_CUTOFF

# 7. Print how many p-values below 10^-2 are found

# Summing a boolean mask counts its True entries.
n_significant = significant.sum()
print('there are', n_significant, 'p-values <0.01.')