User Tools

Site Tools


Sidebar

  • Learn about Wiki
  • Lectures
  • Study
  • Tips
    • study:data_analysis:incanter

      Preparing for and Performing Statistical Data Analysis with Incanter

      generating summary statistics with $rollup

      all_160.P3.csv

      (ns chap7.section1
        (:require [incanter.core :as i]
                  [incanter.io]
                  [incanter.stats :as s]))
       
      ;; Relative path of the census CSV used in this section.
      (def data-file "data/all_160.P3.csv")
      ;; The CSV parsed into an Incanter dataset, read with :header true.
      (def census
        (-> data-file
            (incanter.io/read-dataset :header true)))
       
      ;; Mean of :POP100 within each :STATE group. In the printed result the
      ;; means appear as exact ratios (e.g. 695433/172), not decimals.
      (i/$rollup :mean :POP100 :STATE census)
       
      ;=> [:STATE :POP100]
      ;=> [29 695433/172]
      ;=> [28 812159/181]
      ;=> [31 70836/29]
      ;=> [30 341847/182]
      ;=> [25 2307701/122]
      ;=> [24 4750167/518]
      ;=> [56 437432/203]
      ;=> [27 726932/151]
      ;=> ...
       
      ;; Same grouping, but the summary is the function s/sd passed directly
      ;; instead of a keyword such as :mean.
      (i/$rollup s/sd :POP100 :STATE census)
       
      ;=> [:STATE :POP100]
      ;=> [29 20135.438882226237]
      ;=> [28 11948.001546221063]
      ;=> [31 20443.062360819258]
      ;=> [30 7846.878660629904]
      ;=> [25 46374.42736719484]
      ;=> [24 30175.73504680357]
      ;=> ...

      $rollup

      ($rollup summary-fun col-name group-by data)

      summary-fun으로 집계한 dataset을 리턴한다.

      :max -- the maximum value of the data in each group
      :min -- the minimum value of the data in each group
      :sum -- the sum of the data in each group
      :count -- the number of elements in each group
      :mean -- the mean of the data in each group

      sd

      표준편차(standard deviation)를 구한다.

      differencing variables to show changes

      all_160_in_51.P3.csv

      (ns chap7.section2
        (:require [incanter.core :as i]
                  [incanter.io]))
       
      ;; Relative path of the CSV that carries both :POP100 and :POP100.2000.
      (def data-file "data/all_160_in_51.P3.csv")
      ;; The CSV parsed into an Incanter dataset, read with :header true.
      (def data
        (-> data-file
            (incanter.io/read-dataset :header true)))
       
      (defn replace-empty
        "Returns 0 when the string form of x is empty (this covers both \"\" and
        nil); otherwise returns x unchanged."
        [x]
        (if (seq (str x))
          x
          0))
       
      ;; `data` plus a :POP.DELTA column holding :POP100 minus :POP100.2000.
      ;; Blank :POP100.2000 cells are treated as 0 for the subtraction only;
      ;; the original column is left as it was.
      (def growth-rates
        (let [current  (i/sel data :cols :POP100)
              previous (i/$map replace-empty :POP100.2000 data)
              delta    (i/minus current previous)]
          (i/conj-cols data (i/dataset [:POP.DELTA] delta))))
       
      ;; Show the first five rows of the name, both population columns and the
      ;; computed delta. In the last printed row :POP100.2000 is still "" while
      ;; :POP.DELTA equals :POP100, i.e. the blank was counted as 0.
      (i/sel growth-rates
             :cols
             [:NAME :POP100 :POP100.2000 :POP.DELTA]
             :rows
             (range 5))
       
      ;=> ["Abingdon town" 8191 7780 411]
      ;=> ["Accomac town" 519 547 -28]
      ;=> ["Alberta town" 298 306 -8]
      ;=> ["Alexandria city" 139966 128283 11683]
      ;=> ["Allisonia CDP" 117 "" 117]

      증가량을 계산하기 위해 i/minus를 사용한다. 숫자간 연산이기 때문에 replace-empty 함수로 빈 문자열을 0으로 치환. 이 증가량으로 POP.DELTA 컬럼을 만든 후 기존 데이터와 conj-cols 함수로 결합한다.

      scaling variables to simplify variable relationships

      all_160_in_51.P3.csv

      (ns chap7.section3
        (:require [incanter.core :as i]
                  [incanter.io]
                  [incanter.charts :as c]))
       
      (def data-file "data/all_160_in_51.P3.csv")
      ;; Read the CSV with :header true, then sort the rows by :POP100 ascending.
      (def data
        (->> (incanter.io/read-dataset data-file :header true)
             (i/$order :POP100 :asc)))
       
      ;; Rebind `data` with an extra :POP100.1000 column: :POP100 divided by 1000.0.
      (def data
        (let [scaled (i/div (i/sel data :cols :POP100) 1000.0)]
          (i/conj-cols data (i/dataset [:POP100.1000] scaled))))
       
      ;; Rebind `data` with an extra column holding log10 of :POP100.
      ;; The column is named :POP100.LOG10. The previous name, :POP100.1000, is
      ;; already used by the divided-by-1000 column added in the step before,
      ;; so reusing it gave two different scalings the same column name.
      (def data
        (->> (i/sel data :cols :POP100)
             i/log10
             (i/dataset [:POP100.LOG10])
             (i/conj-cols data)))

      working with time series data with incanter zoo

      ibm.csv

      (ns chap7.section4
        (:require [incanter.core :as i]
                  [incanter.io]
                  [incanter.zoo :as zoo]
                  [clj-time.format :as tf])
        (:import java.util.Locale))
       
      (def data-file "data/ibm.csv")
      ;; When running under a Korean locale, the locale has to be switched so that
      ;; strings such as 29-Nov-12 can be parsed; the formatter is pinned to ENGLISH.
      (def ^:dynamic *formatter*
        (-> (tf/formatter "dd-MMM-yy")
            (tf/with-locale Locale/ENGLISH)))
       
      (defn parse-date
        "Parses the dd-MMM-yy string `date` with *formatter*."
        [date]
        (tf/parse *formatter* date))
       
      ;; :header defaults to false, but the header line was not being dropped,
      ;; so :skip 1 is used to jump over the first row and the column names are
      ;; assigned by hand. A parsed :date column is then appended.
      (def data
        (let [raw   (i/col-names (incanter.io/read-dataset data-file :skip 1)
                                 [:date-str :open :high :low :close :volumn])
              dates (i/$map parse-date :date-str raw)]
          (i/conj-cols raw (i/dataset [:date] dates))))
      ;; Time-series view of `data`, indexed on the :date column.
      (def data-zoo (zoo/zoo data :date))
      ;; `data-zoo` plus :five-day, the result of (zoo/roll-mean 5 ...) over :close.
      (def data-roll5
        (let [closes   (i/sel data-zoo :cols :close)
              five-day (zoo/roll-mean 5 closes)]
          (i/conj-cols data-zoo (i/dataset [:five-day] five-day))))
       
      ;=> [:index :volumn :close :low :high :open :date-str :five-day]
      ;=> [#<DateTime 2001-11-26T00:00:00.000Z> 4808500 116.33 115.2 116.4 115.9 "26-Nov-01" 114.54]
      ;=> [#<DateTime 2001-11-27T00:00:00.000Z> 6881200 114.2 114.07 116.3 116.25 "27-Nov-01" 114.1]
      ;=> [#<DateTime 2001-11-28T00:00:00.000Z> 6123800 112.15 112.1 114.38 114.1 "28-Nov-01" 114.58800000000001]
      ;=> [#<DateTime 2001-11-29T00:00:00.000Z> 6062100 114.43 111.81 114.55 112.9 "29-Nov-01" 116.43800000000002]

      zoo

      This is a port of Zoo from R in order to create the basis of a library for time series data.
      (zoo x index-col)

      zoo/roll-mean

      Returns the unweighted mean of the previous n data points.
      (roll-mean n coll)

      smoothing variables to decrease noise

      pg1661.txt

      (ns chap7.section5
        (:require [incanter.core :as i]
                  [incanter.stats :as s]
                  [incanter.charts :as c]
                  [clojure.string :as str]))
       
      (defn tokenize
        "Splits text into its runs of word characters and lower-cases each one."
        [text]
        (->> text
             (re-seq #"\w+")
             (map str/lower-case)))
       
      (defn count-hits
        "Number of items in coll equal to x; 0 when x does not occur."
        [x coll]
        (reduce (fn [hits item]
                  (if (= item x)
                    (inc hits)
                    hits))
                0
                coll))
       
      (def data-file "data/pg1661.txt")
      ;; Overlapping windows of 500 tokens, each starting 250 tokens after the
      ;; previous one. The same overlap on a small input:
      ;;   (partition 4 2 "12345678")
      ;;   ;=> ((\1 \2 \3 \4) (\3 \4 \5 \6) (\5 \6 \7 \8))
      (def windows
        (->> (slurp data-file)
             tokenize
             (partition 500 250)))
      ;; Occurrences of "baker" in each window.
      (def baker-hits
        (map #(count-hits "baker" %) windows))
       
      (defn rolling-fn
        "Applies f to every run of n consecutive items of coll, advancing one
        item at a time, and returns the lazy sequence of results."
        [f n coll]
        (for [window (partition n 1 coll)]
          (f window)))
       
      ;; s/mean of the "baker" hit counts over each run of 10 consecutive windows.
      (def baker-avgs (rolling-fn s/mean 10 baker-hits))

      validating sample statistics with bootstrapping

      all_160_in_51.P3.csv

      (ns chap7.section6
        (:require [incanter.core :as i]
                  [incanter.stats :as s]
                  [incanter.io]
                  [incanter.charts :as c]))
       
      (def data-file "data/all_160_in_51.P3.csv")
      (def data
        (-> data-file
            (incanter.io/read-dataset :header true)))
      ;; The :POP100 column on its own.
      (def pop100
        (-> data (i/sel :cols :POP100)))
      ;; Bootstrap of s/median over pop100, requested with :size 2000.
      (def samples
        (s/bootstrap pop100 s/median :size 2000))
       
      ;; Histogram of the bootstrapped medians.
      (-> samples
          c/histogram
          i/view)

      median

      ;; The single large value 100 pulls the mean up to 22.0 ...
      (s/mean [1 2 3 4 100])
      ;=> 22.0
      ;; ... while the median stays at the middle element, 3.0.
      (s/median [1 2 3 4 100])
      ;=> 3.0
      • 평균이 아닌 중간값.
      • 정렬한 다음 중간에 있는 값을 취한다.
      • 평균에 영향을 주는 아주 큰 값을 걸러낼 수 있다. 그룹 대표값으로 사용.

      bootstrap

      modeling linear relationships

      all_160_in_51.P35.csv

      (ns chap7.section7
        (:require [incanter.core :as i]
                  [incanter.stats :as s]
                  [incanter.io]
                  [incanter.charts :as c]))
       
      (def data-file "data/all_160_in_51.P35.csv")
      (def family-data
        (-> data-file
            (incanter.io/read-dataset :header true)))
      ;; Single-column selections: housing from :HU100, families from :P035001.
      (def housing
        (-> family-data (i/sel :cols [:HU100])))
      (def families
        (-> family-data (i/sel :cols [:P035001])))
      ;; Linear model over housing and families, fitted with :intercept false.
      (def families-lm
        (s/linear-model housing families :intercept false))
      ;; Scatter plot of families (x) against housing (y) with the model's fitted
      ;; values drawn on top; the chart is displayed and then kept in the var.
      (def housing-chart
        (let [plot (c/scatter-plot families
                                   housing
                                   :title "Relationship of Housing to Families"
                                   :x-label "Families"
                                   :y-label "Housing"
                                   :legend true)]
          (c/add-lines plot
                       families
                       (:fitted families-lm)
                       :series-label "Linear Model")
          (i/view plot)
          plot))

      linear-model

      modeling non-linear relationships

      accident-fatalities.tsv

      (ns chap7.section8
        (:require [incanter.core :as i]
                  [incanter.stats :as s]
                  [incanter.io]
                  [incanter.optimize :as o]
                  [incanter.charts :as c])
        (:import [java.lang Math]))
       
      (def data-file "data/accident-fatalities.tsv")
      ;; Tab-delimited file read with :header true.
      (def data
        (-> data-file
            (incanter.io/read-dataset :header true :delim \tab)))
      ;; Number of rows per speed limit, as a two-column dataset
      ;; [:speed-limit :fatalities] sorted by speed limit ascending.
      ;; Rows whose :spdlim is "." or 0 are dropped.
      ;; The counted column is :Obs. (capital letter O). It was previously
      ;; written :0bs. with a leading digit zero, which looks like a mistyped
      ;; column name. NOTE(review): confirm against the header row of the TSV.
      (def fatalities
        (->> data
             (i/$rollup :count :Obs. :spdlim)
             (i/$where {:spdlim {:$ne "."}})
             (i/$where {:spdlim {:$ne 0}})
             (i/$order :spdlim :asc)
             (i/to-list)
             (i/dataset [:speed-limit :fatalities])))
      ;; The two columns of `fatalities`, pulled out individually.
      (def speed-limit
        (-> fatalities (i/sel :cols :speed-limit)))
      (def fatality-count
        (-> fatalities (i/sel :cols :fatalities)))
      ;; Scatter plot of fatality count against speed limit; the chart is
      ;; displayed and then kept in the var so lines can be added to it later.
      (def chart
        (let [plot (c/scatter-plot speed-limit
                                   fatality-count
                                   :title "Fatalities by Speed Limit (2010)"
                                   :x-label "Speed Limit"
                                   :y-label "Fatality Count"
                                   :legend true)]
          (i/view plot)
          plot))
       
      (defn sine-wave
        "Evaluates amp * sin(ang-freq * x + phase) + shift, where theta is the
        sequence [amp ang-freq phase shift]."
        [theta x]
        (let [[amp ang-freq phase shift] theta
              angle (i/plus (i/mult ang-freq x) phase)
              wave  (i/mult amp (i/sin angle))]
          (i/plus wave shift)))
       
      ;; Initial parameter vector for sine-wave, in the order
      ;; [amp ang-freq phase shift].
      (def start
        [3500.0 0.07 Math/PI 2500.0])
      ;; Non-linear fit of sine-wave to fatality-count over speed-limit,
      ;; starting from `start`.
      (def nlm
        (o/non-linear-model sine-wave fatality-count speed-limit start))
       
      ;; Overlay two curves on the scatter plot: sine-wave evaluated at the
      ;; starting parameters, then the fitted values of the model.
      (let [with-start (c/add-lines chart
                                    speed-limit
                                    (sine-wave start speed-limit))]
        (c/add-lines with-start
                     speed-limit
                     (:fitted nlm)))

      non-linear-model

      modeling multimodal bayesian distributions

      all_160_in_51.P3.csv

      (ns chap7.section9
        (:require [incanter.core :as i]
                  [incanter.stats :as s]
                  [incanter.io]
                  [incanter.charts :as c]
                  [incanter.bayes :as b]))
       
      ;; The census CSV with its columns renamed to short keywords.
      (def census-race
        (-> "data/all_160_in_51.P3.csv"
            (incanter.io/read-dataset :header true)
            (i/col-names
              [:geoid :sumlev :state :county :cbsa :csa :necta :cnecta :name
               :pop :pop2k :housing :housing2k :total :total2k :white :white2k
               :black :black2k :indian :indian2k :asian :asian2k :hawaiian :hawaiian2k
               :other :other2k :multiple :multiple2k])))
      ;; 60 rows taken from a shuffle of census-race, rebuilt as a dataset with
      ;; the same column names.
      (def census-sample
        (let [columns (i/col-names census-race)
              rows    (->> census-race
                           i/to-list
                           shuffle
                           (take 60))]
          (i/dataset columns rows)))
      ;; Columns that are summed per category.
      (def race-keys
        [:white :black :indian :asian :hawaiian :other :multiple])
      ;; Map from each key in race-keys to the sum of that column in census-sample.
      (def race-totals
        (into {}
              (for [k race-keys]
                [k (i/sum (i/$ k census-sample))])))
      ;; The totals, ordered by sorting the map entries of race-totals.
      (def y
        (->> race-totals
             sort
             (map second)))
      (def theta (b/sample-multinomial-params 2000 y))
      ;; Map from each key (in sorted order) to the theta column at the same
      ;; position, matching the order used to build y.
      (def theta-params
        (into {}
              (map-indexed (fn [idx k]
                             [k (i/sel theta :cols idx)])
                           (sort race-keys))))
       
      (-> theta-params
          :black
          c/histogram
          i/view)

      sample-multinomial-params

      finding data errors with benford's law

      all_160_in_51.P35.csv

      (ns chap7.section10
        (:require [incanter.core :as i]
                  [incanter.stats :as s]
                  [incanter.charts :as c]
                  [incanter.io]))
       
      (def data-file "data/all_160_in_51.P35.csv")
      (def data
        (-> data-file
            (incanter.io/read-dataset :header true)))
      ;; Result of s/benford-test run on the :POP100 column.
      (def bt
        (-> data
            (i/sel :cols :POP100)
            s/benford-test))
      ;; Bar chart of the test's :table values against its :row-levels, each
      ;; level incremented by one; the chart is displayed and kept in the var.
      (def chart
        (let [{levels :row-levels, counts :table} bt
              bars (c/bar-chart (map inc levels) counts)]
          (i/view bars)
          bars))

      benford-test