{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Exercise"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "1. Load the data in the file \"sample_data.tsv\" as a numpy array"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(100, 14)\n",
      "['Sample1_1', 'Sample1_2', 'Sample1_3', 'Sample1_4', 'Sample1_5', 'Sample1_6', 'Sample1_7', 'Sample1_8', 'Sample2_1', 'Sample2_2', 'Sample2_3', 'Sample2_4', 'Sample2_5', 'Sample2_6']\n",
      "[[1.53990329e+02 5.49317111e+02 4.74282865e+02 3.58728643e+02\n",
      "  1.47996534e+02 3.01555931e+02 1.41380311e+02 8.64979944e+01\n",
      "  3.00937251e+01 7.50011183e+00 3.96720466e+01 2.42987522e+00\n",
      "  9.50744452e-01 1.03655829e+01]\n",
      " [2.44049301e+02 2.59706275e+02 5.90912558e+01 9.32420957e+01\n",
      "  5.97136359e+02 7.23413019e+01 3.30480044e+02 6.94364713e+01\n",
      "  6.02520296e+01 7.23626417e+01 1.01605109e+02 5.43078039e+01\n",
      "  1.34286906e+01 9.50205155e+02]\n",
      " [2.27548806e+02 1.80711739e+02 1.85180260e+02 5.57781934e+01\n",
      "  3.34017493e+02 3.59707209e+02 2.59451290e+02 6.97516062e+02\n",
      "  3.01565490e+01 5.89768542e+00 1.72388953e+01 5.12154569e+01\n",
      "  2.45332757e+01 5.70640917e+00]\n",
      " [4.90098759e+01 3.73439768e+02 9.71132617e+01 2.09838816e+01\n",
      "  5.33975903e+01 6.01714245e+01 2.26648398e+02 9.83959895e+00\n",
      "  3.96142789e+00 3.52765606e+00 1.84289666e+00 6.01384530e-01\n",
      "  3.52595951e+00 3.99224871e+00]\n",
      " [1.22874477e+03 2.37489780e+02 7.49156874e+02 1.57231490e+03\n",
      "  3.45341020e+02 1.76990723e+02 1.69384604e+02 2.45829002e+03\n",
      "  4.05010885e+01 2.20874788e+01 8.21662250e+01 2.05111050e+01\n",
      "  2.07562236e+01 2.30045811e+01]]\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import scipy.stats as stats\n",
    "\n",
    "# Load the numeric matrix; the first row holds the column names, so it is skipped.\n",
    "# `np.float` was only an alias of the builtin `float` (deprecated in NumPy 1.20,\n",
    "# removed in 1.24), so the builtin is used directly.\n",
    "data = np.loadtxt(\"../exercises/sample_data.tsv\", dtype=float, delimiter='\\t', skiprows=1)\n",
    "\n",
    "# Read the header separately: only the first line of the file is needed.\n",
    "# The `with` block closes the file even if reading raises.\n",
    "with open(\"../exercises/sample_data.tsv\", 'r') as IN:\n",
    "    l = IN.readline()\n",
    "header = l.strip().split('\\t')\n",
    "\n",
    "print( data.shape )\n",
    "print(header)\n",
    "# show the first 5 rows\n",
    "print(data[0:5,])\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "2. Log-transform the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[ 7.26669594  9.10149542  8.88960394  8.48674913  7.20941957  8.2362818\n",
      "   7.14343741  6.43459478  4.91139079  2.90691211  5.31005092  1.28088223\n",
      "  -0.07287048  3.37372934]\n",
      " [ 7.93102881  8.02073706  5.88487275  6.54290953  9.2219166   6.17674766\n",
      "   8.36841935  6.11762173  5.91293793  6.17717317  6.66682913  5.76308762\n",
      "   3.74724673  9.89209522]\n",
      " [ 7.83003221  7.49754642  7.53278651  5.8016293   8.38377985  8.49067926\n",
      "   8.0193199   9.44608263  4.91439943  2.56014887  4.10759542  5.67850738\n",
      "   4.61666797  2.5125832 ]\n",
      " [ 5.61500059  8.54473176  6.60159642  4.39120967  5.73870273  5.91100661\n",
      "   7.82431216  3.29859951  1.98602054  1.81870991  0.88197517 -0.73364034\n",
      "   1.81801591  1.9972016 ]\n",
      " [10.26296956  7.89172162  9.54912404 10.61867447  8.4318779   7.46752993\n",
      "   7.40415894 11.26343941  5.33988878  4.46515684  6.36047358  4.35833331\n",
      "   4.37547207  4.52384928]]\n"
     ]
    }
   ],
   "source": [
    "# Base-2 logarithm of every entry of the data matrix.\n",
    "log_data = np.log2(data)\n",
    "\n",
    "# Preview the first 5 rows.\n",
    "print(log_data[:5])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "3. Find the row-wise means for replicates of Sample1 and Sample2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Sample1_1', 'Sample1_2', 'Sample1_3', 'Sample1_4', 'Sample1_5', 'Sample1_6', 'Sample1_7', 'Sample1_8', 'Sample2_1', 'Sample2_2', 'Sample2_3', 'Sample2_4', 'Sample2_5', 'Sample2_6']\n",
      "sample 1\n",
      "[[7.26669594 9.10149542 8.88960394 8.48674913 7.20941957 8.2362818\n",
      "  7.14343741 6.43459478]\n",
      " [7.93102881 8.02073706 5.88487275 6.54290953 9.2219166  6.17674766\n",
      "  8.36841935 6.11762173]]\n",
      "sample 2\n",
      "[[ 4.91139079  2.90691211  5.31005092  1.28088223 -0.07287048  3.37372934]\n",
      " [ 5.91293793  6.17717317  6.66682913  5.76308762  3.74724673  9.89209522]]\n"
     ]
    }
   ],
   "source": [
    "print(header)\n",
    "# Column indexes of each sample's replicates, selected by column-name prefix.\n",
    "sample1_columns = [idx for idx, name in enumerate(header) if name.startswith('Sample1')]\n",
    "sample2_columns = [idx for idx, name in enumerate(header) if name.startswith('Sample2')]\n",
    "\n",
    "# Preview the first two rows of each group of replicates.\n",
    "for label, columns in (('sample 1', sample1_columns), ('sample 2', sample2_columns)):\n",
    "    print(label)\n",
    "    print(log_data[0:2, columns])\n",
    "\n",
    "\n",
    "# Row-wise means: average over the replicate columns (axis=1) of each sample.\n",
    "meanSample1 = log_data[:, sample1_columns].mean(axis=1)\n",
    "meanSample2 = log_data[:, sample2_columns].mean(axis=1)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "4. Find the row-wise standard deviations the same way as the means"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0.8982444  1.17186414 0.9802752  1.59197928 1.41364824 0.99993551\n",
      " 1.88629293 1.50467958 1.18511367 1.05330335 1.83367544 1.40746776\n",
      " 1.36265871 1.26695411 1.16673621 1.28563905 1.22944372 1.68667408\n",
      " 1.15295969 1.63616738 0.91075652 0.97800572 1.44365928 1.63632368\n",
      " 0.88637574 0.93675936 1.93806336 1.14017755 1.05693079 1.29922139\n",
      " 1.65824749 1.64965071 0.85621216 1.05817743 1.12967202 0.72193852\n",
      " 1.01901309 1.7850736  1.15023303 1.62521192 1.99717997 1.7163397\n",
      " 1.33497956 1.00729917 1.61804379 1.17901292 1.22221936 1.37861907\n",
      " 1.34840682 0.98922408 0.90749128 1.51667316 1.26581749 1.79348312\n",
      " 1.81933228 1.43390095 1.54297492 1.11710552 1.12289861 1.59204195\n",
      " 1.6647315  1.1377631  1.1699277  1.08745105 1.34933699 1.2712262\n",
      " 1.16740293 1.17204402 1.20908614 0.77085071 1.36935127 1.04935491\n",
      " 1.178957   1.16375856 1.37882259 1.28504204 0.8569876  1.66829398\n",
      " 0.74458413 1.54247934 1.66462842 1.39386413 0.59091843 1.92974744\n",
      " 1.21940955 1.78749535 1.43654626 1.23092372 1.61243394 1.27934399\n",
      " 1.00650362 1.3510669  1.46403519 1.03406454 1.27197815 1.34001868\n",
      " 1.36328911 0.88681634 0.91910066 0.96301079]\n"
     ]
    }
   ],
   "source": [
    "# Row-wise standard deviations over the replicate columns of each sample\n",
    "# (NumPy's default ddof=0, i.e. the population standard deviation).\n",
    "stdSample1 = log_data[:, sample1_columns].std(axis=1)\n",
    "stdSample2 = log_data[:, sample2_columns].std(axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "5. Use the function *scipy.stats.ttest_ind* to calculate a p-value for every row"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Welch's t-test (equal_var=False) between the two samples, one test per row;\n",
    "# only the p-values are kept.\n",
    "welch_test = stats.ttest_ind(\n",
    "    log_data[:, sample1_columns],\n",
    "    log_data[:, sample2_columns],\n",
    "    axis=1,\n",
    "    equal_var=False,\n",
    ")\n",
    "TTest_pValues = welch_test.pvalue"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "6. Select p-values which are smaller than $10^{-2}$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Boolean mask: True for every row whose p-value is below the 10**-2 threshold.\n",
    "significant = np.less(TTest_pValues, 10**-2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "7. Print how many P-values below $10^{-2}$ are found"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "there are 74 p-values <0.01.\n"
     ]
    }
   ],
   "source": [
    "# Count the True entries of the mask, i.e. the number of significant rows.\n",
    "n_significant = np.count_nonzero(significant)\n",
    "print('there are', n_significant, 'p-values <0.01.')\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}