# Logistic regression- One predictor
# by Aaron Albin
# Script began: 3/13/2012
###########################
# WHEN TO USE THIS SCRIPT #
###########################
# Use this script if you can answer 'yes' to all of the following questions:
# A) The predicted variable in your dataset is binary (AKA 'dichotomous'), i.e. Yes/No, True/False, Accept/Reject, etc., and this is stored in one column of your dataframe.
# B) The predictor variable in your dataset is continuous-numeric (i.e. numbers with decimal places), and this is stored in another separate column.
# C) You are interested in determining whether a change in your predictor variable (more or less on the continuous scale) has a systematic effect on your binary predicted variable (in terms of raising or lowering the proportion of 'Yes'es as opposed to 'No's, etc.).
# Examples:
# * Predicting whether a hospital patient does or does not have some sort of health-related problem based on their blood pressure.
# * Predicting whether a person is male or female based on their height
###############
# SELECT DATA #
###############
# Run this to have R tell you the names for each of the columns that it ended up with:
#=====[1]=====
# Explicitly print the column names of the dataframe so you can see what R
# ended up calling each column (top-level auto-printing does the same thing).
print(colnames(Dataframe))
#=============
# Pick out which of these columns contains your binary predicted (AKA 'dependent'/'response') variable and type the name of that column between the double-quotes below.
# NOTE: Make sure you have only two values in this column! The analysis will not work correctly if you have more than two.
# The binary information can be stored in any format - numbers (like '0' vs '1', or '1' vs '2') or characters (like 'yes' vs 'no' or 'TRUE' vs 'FALSE'). This script will be able to handle any such format.
#=====[2]=====
# Store the column name of the binary predicted variable.
# Use '<-' for assignment, the standard R convention.
Name_BinaryPredictedVariable <- "BinaryResponseColumn"
# Example from the author's own data:
#Name_BinaryPredictedVariable <- "CONSONAT"
#=============
# Now do the same for your continuous-numeric predictor variable (AKA 'independent variable'), typing the name of that column between the double-quotes below.
#=====[3]=====
# Store the column name of the continuous-numeric predictor variable.
# Use '<-' for assignment, the standard R convention.
Name_PredictorVariable <- "NumberColumn"
# Example from the author's own data:
#Name_PredictorVariable <- "F1"
#=============
# Now run this to extract the data from the specific columns you selected
#=====[4]=====
# Extract the two selected columns from the dataframe.
# The predicted variable is coerced to a factor so that glm() treats it as
# categorical regardless of how it was stored (0/1, "yes"/"no", etc.).
BinaryPredictedVariable <- as.factor(Dataframe[, Name_BinaryPredictedVariable])
# Validate early: logistic regression requires exactly two levels in the
# response, so stop with a clear message rather than fitting a broken model.
if (nlevels(BinaryPredictedVariable) != 2) {
  stop("The predicted variable column '", Name_BinaryPredictedVariable,
       "' must contain exactly two distinct values, but it has ",
       nlevels(BinaryPredictedVariable), ".", call. = FALSE)
}
PredictorVariable <- Dataframe[, Name_PredictorVariable]
#=============
###########################
# SET UP AND RUN ANALYSIS #
###########################
# Run these two lines of code to conduct the analysis:
#=====[5]=====
# Fit the logistic regression: a generalized linear model with a binomial
# error distribution and logit link, estimated via iteratively reweighted
# least squares. Arguments left at their defaults in the original call
# (start, method, contrasts) are omitted; 'x = TRUE' and 'y = TRUE' keep
# the model matrix and response in the fitted object for later inspection.
FittedModel <- glm(
  formula = BinaryPredictedVariable ~ PredictorVariable,
  family = binomial(link = "logit"),
  data = Dataframe,
  model = TRUE,
  x = TRUE,
  y = TRUE
)
# Obtain a summary of the generalized linear model fit just conducted.
# summary() dispatches to summary.glm(); 'correlation = TRUE' additionally
# reports the correlation matrix of the coefficient estimates.
ResultsSummary <- summary(object = FittedModel, dispersion = NULL, correlation = TRUE)
#=============
#######################
# INSPECT THE RESULTS #
#######################
# Run the following code for a reformatted, easy-to-read representation of the results:
#=====[6]=====
# Print a plain-English report of the fitted model.
# Two fixes relative to the original:
# 1) The column-name string "Std. Error" was broken across a line,
#    embedding a newline into the subscript name and causing a
#    "subscript out of bounds" error. It is now intact.
# 2) exp(log-odds) is the ODDS, not the probability. plogis() converts
#    log-odds to a true probability (and is naturally bounded in [0, 1],
#    so the old max(0, ...)/min(..., 1) clamps are unnecessary). Likewise,
#    100*(exp(slope)-1) is a percent change in the odds, and the deviance
#    ratio is "deviance explained", not variance -- the wording now says so.
Coefficients <- ResultsSummary$coefficients
InterceptEstimate <- Coefficients["(Intercept)", "Estimate"]
InterceptStdError <- Coefficients["(Intercept)", "Std. Error"]
SlopeEstimate <- Coefficients["PredictorVariable", "Estimate"]
SlopeStdError <- Coefficients["PredictorVariable", "Std. Error"]
cat(
  "\n\nMODEL USED\n~~~~~~~~~~\n\n",
  "A simple logistic regression was run (using a ", FittedModel$family$family,
  " error distribution\n and ", FittedModel$family$link,
  " link function), modeling the influence that ", Name_PredictorVariable,
  " has\n on the proportion of '", levels(BinaryPredictedVariable)[2],
  "' (as opposed to '", levels(BinaryPredictedVariable)[1], "') in ",
  Name_BinaryPredictedVariable,
  ".\n\n\nRESULTS\n~~~~~~~\n\n",
  "When ", Name_PredictorVariable, " is 0, the probability of getting '",
  levels(BinaryPredictedVariable)[2], "' is\n approximately ",
  plogis(InterceptEstimate),
  " (standard error ranging from ",
  plogis(InterceptEstimate - InterceptStdError), " to ",
  plogis(InterceptEstimate + InterceptStdError), ").\n z=",
  Coefficients["(Intercept)", "z value"], ", p=",
  Coefficients["(Intercept)", "Pr(>|z|)"],
  "\n\nFor every increase of +1 in ", Name_PredictorVariable,
  ", the odds of getting '", levels(BinaryPredictedVariable)[2],
  "' change by\n approximately ", 100 * (exp(SlopeEstimate) - 1),
  " percent (standard error ranging from\n ",
  100 * (exp(SlopeEstimate - SlopeStdError) - 1), " to ",
  100 * (exp(SlopeEstimate + SlopeStdError) - 1), " percent).\n z=",
  Coefficients["PredictorVariable", "z value"], ", p=",
  Coefficients["PredictorVariable", "Pr(>|z|)"],
  "\n\nAs a whole, this model explains ",
  signif(100 * (1 - (FittedModel$deviance / FittedModel$null.deviance)), 2),
  "% of the deviance in ", Name_BinaryPredictedVariable,
  ".\n Null deviance: ", FittedModel$null.deviance,
  "\n Residual deviance: ", FittedModel$deviance,
  "\n Akaike Information Criterion: ", FittedModel$aic,
  "\n\n\n",
  sep = ""
)
#=============
# Here is R's default display of the results:
#=====[7]=====
# Show R's default rendering of the summary.glm object, one argument per
# line: 4 significant digits, significance stars on, and the coefficient
# correlation matrix printed numerically rather than symbolically.
print(
  x = ResultsSummary,
  digits = 4,
  signif.stars = TRUE,
  symbolic.cor = FALSE
)
#=============