User Tools

Site Tools


study:data_analysis:incanter

Preparing for and Performing Statistical Data Analysis with Incanter

generating summary statistics with $rollup

all_160.P3.csv

(ns chap7.section1
  (:require [incanter.core :as i]
            [incanter.io]
            [incanter.stats :as s]))
 
;; Location of the census CSV summarised in this section.
(def data-file "data/all_160.P3.csv")
;; The CSV loaded as a dataset (read with :header true).
(def census
  (-> data-file
      (incanter.io/read-dataset :header true)))
 
;; Mean of :POP100 within each :STATE group.
(->> census
     (i/$rollup :mean :POP100 :STATE))
 
;=> [:STATE :POP100]
;=> [29 695433/172]
;=> [28 812159/181]
;=> [31 70836/29]
;=> [30 341847/182]
;=> [25 2307701/122]
;=> [24 4750167/518]
;=> [56 437432/203]
;=> [27 726932/151]
;=> ...
 
;; Same grouping, summarised with a function (s/sd) instead of a keyword.
(->> census
     (i/$rollup s/sd :POP100 :STATE))
 
;=> [:STATE :POP100]
;=> [29 20135.438882226237]
;=> [28 11948.001546221063]
;=> [31 20443.062360819258]
;=> [30 7846.878660629904]
;=> [25 46374.42736719484]
;=> [24 30175.73504680357]
;=> ...

$rollup

($rollup summary-fun col-name group-by data)

summary-fun으로 집계한 dataset을 리턴한다.

:max -- the maximum value of the data in each group
:min -- the minimum value of the data in each group
:sum -- the sum of the data in each group
:count -- the number of elements in each group
:mean -- the mean of the data in each group

sd

표준편차(standard deviation)를 구한다.

differencing variables to show changes

all_160_in_51.P3.csv

(ns chap7.section2
  (:require [incanter.core :as i]
            [incanter.io]))
 
;; CSV analysed in this section.
(def data-file "data/all_160_in_51.P3.csv")
;; Its contents as a dataset (read with :header true).
(def data
  (-> data-file
      (incanter.io/read-dataset :header true)))
 
(defn replace-empty
  "Returns 0 when the string form of `x` is empty (e.g. \"\" or nil);
  otherwise returns `x` unchanged."
  [x]
  (if (seq (str x))
    x
    0))
 
;; `data` plus a :POP.DELTA column holding :POP100 minus :POP100.2000.
;; Empty :POP100.2000 cells are replaced by 0 first so the subtraction is
;; purely numeric.
(def growth-rates
  (let [current  (i/sel data :cols :POP100)
        previous (i/$map replace-empty :POP100.2000 data)
        delta    (i/minus current previous)]
    (i/conj-cols data
                 (i/dataset [:POP.DELTA] delta))))
 
;; Peek at the first five rows: name, both population columns and the delta.
(-> growth-rates
    (i/sel :cols [:NAME :POP100 :POP100.2000 :POP.DELTA]
           :rows (range 5)))
 
;=> ["Abingdon town" 8191 7780 411]
;=> ["Accomac town" 519 547 -28]
;=> ["Alberta town" 298 306 -8]
;=> ["Alexandria city" 139966 128283 11683]
;=> ["Allisonia CDP" 117 "" 117]

증가량을 계산하기 위해 i/minus를 사용한다. 숫자간 연산이기 때문에 replace-empty 함수로 빈 문자열을 0으로 치환. 이 증가량으로 POP.DELTA 컬럼을 만든 후 기존 데이터와 conj-cols 함수로 결합한다.

scaling variables to simplify variable relationships

all_160_in_51.P3.csv

(ns chap7.section3
  (:require [incanter.core :as i]
            [incanter.io]
            [incanter.charts :as c]))
 
;; CSV analysed in this section.
(def data-file "data/all_160_in_51.P3.csv")
;; The dataset, ordered by :POP100 ascending.
(def data
  (->> (incanter.io/read-dataset data-file :header true)
       (i/$order :POP100 :asc)))
 
;; Re-define `data` with an extra :POP100.1000 column: :POP100 divided by 1000.0.
(def data
  (let [thousands (i/div (i/sel data :cols :POP100) 1000.0)]
    (i/conj-cols data
                 (i/dataset [:POP100.1000] thousands))))
 
;; Re-define `data` with an extra column holding log10 of :POP100.
;; The column is named :POP100.LOG10: labelling it :POP100.1000 would both
;; misdescribe the values and clash with the column of that name that the
;; divide-by-1000 step adds to the same dataset.
(def data
  (->> (i/sel data :cols :POP100)
       i/log10
       (i/dataset [:POP100.LOG10])
       (i/conj-cols data)))

working with time series data with incanter zoo

ibm.csv

(ns chap7.section4
  (:require [incanter.core :as i]
            [incanter.io]
            [incanter.zoo :as zoo]
            [clj-time.format :as tf])
  (:import java.util.Locale))
 
;; CSV with the IBM price history used in this section.
(def data-file "data/ibm.csv")
;; In a Korean-locale environment the formatter's locale has to be switched
;; (here to ENGLISH) before strings such as 29-Nov-12 can be parsed.
(def ^:dynamic *formatter*
  (-> (tf/formatter "dd-MMM-yy")
      (tf/with-locale Locale/ENGLISH)))
 
(defn parse-date
  "Parses the string `date` using the current binding of *formatter*."
  [date]
  (tf/parse *formatter* date))
 
; :header defaults to false, but the header row was not being dropped,
; so the first row is skipped explicitly with :skip 1 and the columns are
; named by hand.
; NOTE(review): :volumn looks like a misspelling of :volume; it is kept
; because the sample output for this section lists the column under that name.
(def data
  (i/with-data (-> (incanter.io/read-dataset data-file :skip 1)
                   (i/col-names [:date-str :open :high :low :close :volumn]))
    ;; Append a :date column parsed from :date-str to the bound dataset.
    (i/conj-cols i/$data
                 (i/dataset [:date]
                            (i/$map parse-date :date-str)))))
;; `data` converted to a zoo object, with :date as the index column.
(def data-zoo
  (-> data
      (zoo/zoo :date)))
;; `data-zoo` plus a :five-day column: roll-mean with n = 5 over :close.
(def data-roll5
  (let [closes   (i/sel data-zoo :cols :close)
        five-day (zoo/roll-mean 5 closes)]
    (i/conj-cols data-zoo
                 (i/dataset [:five-day] five-day))))
 
;=> [:index :volumn :close :low :high :open :date-str :five-day]
;=> [#<DateTime 2001-11-26T00:00:00.000Z> 4808500 116.33 115.2 116.4 115.9 "26-Nov-01" 114.54]
;=> [#<DateTime 2001-11-27T00:00:00.000Z> 6881200 114.2 114.07 116.3 116.25 "27-Nov-01" 114.1]
;=> [#<DateTime 2001-11-28T00:00:00.000Z> 6123800 112.15 112.1 114.38 114.1 "28-Nov-01" 114.58800000000001]
;=> [#<DateTime 2001-11-29T00:00:00.000Z> 6062100 114.43 111.81 114.55 112.9 "29-Nov-01" 116.43800000000002]

zoo

This is a port of Zoo from R in order to create the basis of a library for time series data.
(zoo x index-col)

zoo/roll-mean

Returns the unweighted mean of the previous n data points.
(roll-mean n coll)

smoothing variables to decrease noise

pg1661.txt

(ns chap7.section5
  (:require [incanter.core :as i]
            [incanter.stats :as s]
            [incanter.charts :as c]
            [clojure.string :as str]))
 
(defn tokenize
  "Splits `text` into runs of word characters (regex \\w+) and returns them
  as a lazy sequence of lower-cased strings."
  [text]
  (->> text
       (re-seq #"\w+")
       (map str/lower-case)))
 
(defn count-hits
  "Returns how many times `x` occurs in `coll`, or 0 when it never does."
  [x coll]
  (-> coll
      frequencies
      (get x 0)))
 
;; Plain-text file analysed in this section.
(def data-file "data/pg1661.txt")
;; Windows of 500 tokens, a new window starting every 250 tokens.
;; How partition's step argument behaves:
; (partition 4 2 "12345678")
;=> ((\1 \2 \3 \4) (\3 \4 \5 \6) (\5 \6 \7 \8))
(def windows
  (->> data-file
       slurp
       tokenize
       (partition 500 250)))
;; Number of occurrences of the token "baker" in each window.
(def baker-hits
  (->> windows
       (map (partial count-hits "baker"))))
 
(defn rolling-fn
  "Applies `f` to every window of `n` consecutive items of `coll`, sliding
  forward one item at a time, and returns the lazy sequence of results."
  [f n coll]
  (->> coll
       (partition n 1)
       (map f)))
 
;; s/mean over every run of 10 consecutive hit counts.
(def baker-avgs
  (->> baker-hits
       (rolling-fn s/mean 10)))

validating sample statistics with bootstrapping

all_160_in_51.P3.csv

(ns chap7.section6
  (:require [incanter.core :as i]
            [incanter.stats :as s]
            [incanter.io]
            [incanter.charts :as c]))
 
;; CSV analysed in this section.
(def data-file "data/all_160_in_51.P3.csv")
(def data
  (-> data-file
      (incanter.io/read-dataset :header true)))
;; The :POP100 column on its own.
(def pop100
  (-> data
      (i/sel :cols :POP100)))
;; Bootstrap of s/median over pop100, called with :size 2000.
(def samples
  (-> pop100
      (s/bootstrap s/median :size 2000)))
 
;; Show the bootstrap results as a histogram.
(-> samples
    c/histogram
    i/view)

median

(s/mean [1 2 3 4 100])
;=> 22.0
(s/median [1 2 3 4 100])
;=> 3.0
  • 평균이 아닌 중간값.
  • 정렬한 다음 중간에 있는 값을 취한다.
  • 평균에 영향을 주는 아주 큰 값을 걸러낼 수 있다. 그룹 대표값으로 사용.

bootstrap

modeling linear relationships

all_160_in_51.P35.csv

(ns chap7.section7
  (:require [incanter.core :as i]
            [incanter.stats :as s]
            [incanter.io]
            [incanter.charts :as c]))
 
;; CSV analysed in this section.
(def data-file "data/all_160_in_51.P35.csv")
(def family-data
  (-> data-file
      (incanter.io/read-dataset :header true)))
;; Single-column selections used as the two model variables.
(def housing
  (-> family-data
      (i/sel :cols [:HU100])))
(def families
  (-> family-data
      (i/sel :cols [:P035001])))
;; Linear model with `housing` as the first argument and `families` as the
;; second, fitted with :intercept false.
(def families-lm
  (s/linear-model housing
                  families
                  :intercept false))
;; Scatter plot of families (x) against housing (y) with the model's fitted
;; values drawn on top; the chart is displayed and also kept in the var.
(def housing-chart
  (let [plot (c/scatter-plot families
                             housing
                             :title "Relationship of Housing to Families"
                             :x-label "Families"
                             :y-label "Housing"
                             :legend true)]
    (c/add-lines plot
                 families
                 (:fitted families-lm)
                 :series-label "Linear Model")
    (i/view plot)
    plot))

linear-model

modeling non-linear relationships

accident-fatalities.tsv

(ns chap7.section8
  (:require [incanter.core :as i]
            [incanter.stats :as s]
            [incanter.io]
            [incanter.optimize :as o]
            [incanter.charts :as c])
  (:import [java.lang Math]))
 
;; Tab-separated file analysed in this section.
(def data-file "data/accident-fatalities.tsv")
(def data (incanter.io/read-dataset data-file :header true :delim \tab))
;; Number of records per speed limit, as a two-column dataset
;; [:speed-limit :fatalities] ordered by speed limit.
;; The counted column is :Obs. (capital letter O). The previous spelling
;; :0bs. began with the digit zero, which names a different column.
;; NOTE(review): assumes the file's header names the column "Obs." -- confirm
;; against the data file.
(def fatalities
  (->> data
       (i/$rollup :count :Obs. :spdlim)
       ;; Drop the rows whose speed limit is "." or 0.
       (i/$where {:spdlim {:$ne "."}})
       (i/$where {:spdlim {:$ne 0}})
       (i/$order :spdlim :asc)
       ;; Rebuild the dataset from its rows to give the columns new names.
       (i/to-list)
       (i/dataset [:speed-limit :fatalities])))
;; The two columns of `fatalities`, pulled out individually.
(def speed-limit
  (-> fatalities
      (i/sel :cols :speed-limit)))
(def fatality-count
  (-> fatalities
      (i/sel :cols :fatalities)))
;; Scatter plot of fatality count against speed limit; displayed immediately
;; and kept in the var so that lines can be added to it later.
(def chart
  (let [plot (c/scatter-plot speed-limit
                             fatality-count
                             :title "Fatalities by Speed Limit (2010)"
                             :x-label "Speed Limit"
                             :y-label "Fatality Count"
                             :legend true)]
    (i/view plot)
    plot))
 
(defn sine-wave
  "Evaluates amp * sin(ang-freq * x + phase) + shift using Incanter's
  arithmetic functions, where `theta` is the sequence
  [amp ang-freq phase shift]."
  [theta x]
  (let [[amp ang-freq phase shift] theta
        angle (i/plus (i/mult ang-freq x) phase)
        wave  (i/mult amp (i/sin angle))]
    (i/plus wave shift)))
 
;; Initial parameter values, in sine-wave's [amp ang-freq phase shift] order.
(def start [3500.0 0.07 Math/PI 2500.0])
;; Non-linear model built from sine-wave, starting at `start`.
(def nlm
  (o/non-linear-model sine-wave fatality-count speed-limit start))
 
;; Draw two curves on the existing chart: first sine-wave at the starting
;; parameters, then the model's fitted values.
(c/add-lines
  (c/add-lines chart
               speed-limit
               (sine-wave start speed-limit))
  speed-limit
  (:fitted nlm))

non-linear-model

modeling multimodal Bayesian distributions

all_160_in_51.P3.csv

(ns chap7.section9
  (:require [incanter.core :as i]
            [incanter.stats :as s]
            [incanter.io]
            [incanter.charts :as c]
            [incanter.bayes :as b]))
 
;; The census CSV with its columns renamed to short keywords.
(def census-race
  (-> "data/all_160_in_51.P3.csv"
      (incanter.io/read-dataset :header true)
      (i/col-names [:geoid :sumlev :state :county :cbsa :csa :necta :cnecta :name
                    :pop :pop2k :housing :housing2k :total :total2k :white :white2k
                    :black :black2k :indian :indian2k :asian :asian2k :hawaiian :hawaiian2k
                    :other :other2k :multiple :multiple2k])))
;; 60 rows of census-race taken after shuffling, rebuilt as a dataset with the
;; same column names.
(def census-sample
  (i/dataset (i/col-names census-race)
             (take 60 (shuffle (i/to-list census-race)))))
;; Columns of the sample that are totalled below.
(def race-keys
  [:white :black :indian :asian :hawaiian :other :multiple])
;; Map from each race key to the sum of that column in census-sample.
(def race-totals
  (into {}
        (for [race race-keys]
          [race (i/sum (i/$ race census-sample))])))
;; The totals alone, ordered by their (sorted) key.
(def y
  (->> race-totals
       sort
       (map second)))
(def theta
  (b/sample-multinomial-params 2000 y))
;; Map from each race key to a column of theta. The keys are sorted the same
;; way as for `y`, so the i-th sorted key is paired with column i.
(def theta-params
  (into {}
        (map-indexed (fn [idx race]
                       [race (i/sel theta :cols idx)])
                     (sort race-keys))))
 
;; Histogram of the column paired with :black.
(-> (:black theta-params)
    c/histogram
    i/view)

sample-multinomial-params

finding data errors with Benford's law

all_160_in_51.P35.csv

(ns chap7.section10
  (:require [incanter.core :as i]
            [incanter.stats :as s]
            [incanter.charts :as c]
            [incanter.io]))
 
;; CSV analysed in this section.
(def data-file "data/all_160_in_51.P35.csv")
(def data
  (-> data-file
      (incanter.io/read-dataset :header true)))
;; Result of s/benford-test on the :POP100 column.
(def bt
  (-> data
      (i/sel :cols :POP100)
      s/benford-test))
;; Bar chart of the test's :table values against its :row-levels, each
;; incremented by one; the chart is displayed and kept in the var.
(def chart
  (let [digits    (map inc (:row-levels bt))
        frequency (:table bt)
        plot      (c/bar-chart digits frequency)]
    (i/view plot)
    plot))

benford-test

project file

study/data_analysis/incanter.txt · Last modified: 2019/02/04 14:26 (external edit)