{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---\n",
    "\n",
    "_You are currently looking at **version 1.0** of this notebook. To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the [Jupyter Notebook FAQ](https://www.coursera.org/learn/python-data-analysis/resources/0dhYG) course resource._\n",
    "\n",
    "---"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Distributions in Pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# A single trial with success probability 0.5 (a fair coin flip): evaluates to 0 or 1.\n",
    "np.random.binomial(1, 0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Successes out of 1000 fair trials, divided by 1000 to give the observed proportion.\n",
    "np.random.binomial(1000, 0.5)/1000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 0.01/100 is a per-trial probability of 0.0001; count the events across 100000 trials.\n",
    "chance_of_tornado = 0.01/100\n",
    "np.random.binomial(100000, chance_of_tornado)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def count_back_to_back(events):\n",
    "    \"\"\"Return how many entries of `events` equal 1 and directly follow another 1.\n",
    "\n",
    "    `events` is a sequence of 0/1 flags in chronological order. A run of k\n",
    "    consecutive 1s contributes k - 1 pairs; fewer than two entries gives 0.\n",
    "    \"\"\"\n",
    "    events = np.asarray(events)\n",
    "    # Compare each entry with its predecessor via two offset slices. This covers\n",
    "    # every adjacent pair, including the final one (index len(events)-1), and\n",
    "    # avoids a Python-level loop over the whole array.\n",
    "    return int(np.count_nonzero((events[1:] == 1) & (events[:-1] == 1)))\n",
    "\n",
    "\n",
    "chance_of_tornado = 0.01\n",
    "\n",
    "# One draw per simulated day: 1 marks a tornado, 0 marks none.\n",
    "tornado_events = np.random.binomial(1, chance_of_tornado, 1000000)\n",
    "\n",
    "two_days_in_a_row = count_back_to_back(tornado_events)\n",
    "\n",
    "print('{} tornadoes back to back in {} years'.format(two_days_in_a_row, 1000000/365))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# One draw from the uniform distribution between 0 and 1.\n",
    "np.random.uniform(0, 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# One draw from a normal distribution centred at 0.75; the scale is left at NumPy's default.\n",
    "np.random.normal(0.75)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Formula for standard deviation\n",
"$$\\sqrt{\\frac{1}{N} \\sum_{i=1}^N (x_i - \\overline{x})^2}$$" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "distribution = np.random.normal(0.75,size=1000)\n", "\n", "np.sqrt(np.sum((np.mean(distribution)-distribution)**2)/len(distribution))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [], "source": [ "np.std(distribution)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import scipy.stats as stats\n", "stats.kurtosis(distribution)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "stats.skew(distribution)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "chi_squared_df2 = np.random.chisquare(2, size=10000)\n", "stats.skew(chi_squared_df2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "chi_squared_df5 = np.random.chisquare(5, size=10000)\n", "stats.skew(chi_squared_df5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "%matplotlib inline\n", "import matplotlib\n", "import matplotlib.pyplot as plt\n", "\n", "output = plt.hist([chi_squared_df2,chi_squared_df5], bins=50, histtype='step', \n", " label=['2 degrees of freedom','5 degrees of freedom'])\n", "plt.legend(loc='upper right')\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Hypothesis Testing" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df = pd.read_csv('grades.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 
null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Number of rows in the grades table.\n",
    "len(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Split the rows on the assignment 1 submission value.\n",
    "# NOTE(review): if this column holds timestamp strings (not confirmed here), the\n",
    "# comparison is lexical, so a value like '2015-12-31 09:00' sorts after\n",
    "# '2015-12-31' and lands in `late` - confirm that is the intended cutoff.\n",
    "early = df[df['assignment1_submission'] <= '2015-12-31']\n",
    "late = df[df['assignment1_submission'] > '2015-12-31']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Average only the numeric columns. Without numeric_only=True, pandas 2.x raises\n",
    "# TypeError when the frame has non-numeric columns (the submission column is\n",
    "# compared against a string above), while older versions silently dropped them.\n",
    "early.mean(numeric_only=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Same numeric-only averages for the late group.\n",
    "late.mean(numeric_only=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from scipy import stats\n",
    "# The trailing ? is IPython syntax that shows the docstring for ttest_ind.\n",
    "stats.ttest_ind?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Independent two-sample t-test on assignment 1 grades: early vs. late group.\n",
    "stats.ttest_ind(early['assignment1_grade'], late['assignment1_grade'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Same test on assignment 2 grades.\n",
    "stats.ttest_ind(early['assignment2_grade'], late['assignment2_grade'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Same test on assignment 3 grades.\n",
    "stats.ttest_ind(early['assignment3_grade'], late['assignment3_grade'])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}