1 files changed, 165 insertions, 0 deletions
diff --git a/ch26/26_a_2/26_a_2.hs b/ch26/26_a_2/26_a_2.hs
new file mode 100644
index 0000000..fdafaeb
--- /dev/null
+++ b/ch26/26_a_2/26_a_2.hs
@@ -0,0 +1,165 @@
+-- Difficult: write a QuickCheck property that checks whether the observed false
+-- positive rate is close to the requested false positive rate.
+
+
+-- I think testing/proving statistical properties can never be done using finite
+-- data. Thus, this test will only be a good effort for adequate testing. This
+-- also implies that instead of trying to make every test run pass, the overall
+-- test result will be determined by comparing number of passes and number of
+-- discards.
+
+-- Requesting false positive rate happens via the easyList function, so this
+-- exercise is about testing that function.
+
+-- The "is close" in the assignment is unclear. The authors probably meant to
+-- check that the false positive rate is lower or close-to-equal than the
+-- requested rate. For the "close" I add 10% margin to the requested false
+-- positive rate when comparing the actual rate to it.
+
+
+-- After running a few experiments with QuickCheck, I found that the initial
+-- lists passed to the property function were, for example '[(),(),(),()]' or
+-- '[0,0,0,0]' when I restricted the type to [Int].
+--
+-- I want to test the worst cases. It means, increasing the chance of getting a
+-- false positive and see whether it is still less or equally frequent than
+-- requested. The more values with different hashes are inserted to the bloom
+-- filter (it means, the more bits are set), the higher chance of a false
+-- positive. Thus, lists filled with the same value are not suitable.
+--
+-- To make the test simpler and easier to control, I will restrict the input
+-- list type to [Int].
+
+-- The test for worst cases will be as follows:
+--
+--    1) Generate a random number L from range [maxLen..minLen]. The range is
+--       limited so the lists are not too big and the testing not too slow.
+--
+--    2) Create a list XS of length L of random Int values. They don't have to
+--       be unique. I don't assume properties of the hash function used, so it's
+--       not possible to tell that all-unique values would be the worst case
+--       (set as many bits in the filter as possible).
+--
+--    3) Create a list YS of length ysRatio*L of random Int values different
+--       from XS. For the worst case it is important that all values in YS are
+--       different from values in XS for getting as many false positives as
+--       possible. It doesn't matter whether YS values are unique among
+--       themselves. YS list should be as long as possible. If it's short, for
+--       example, just 2 values, and both of them are false positives, the
+--       actual false positive rate would be 2/2 = 1.0. The longer the list is,
+--       the higher chance of getting closer to true false positive rate. I
+--       restricted the length of YS to be a multiple of the length of XS. It
+--       should be long enough, but not too long for the testing to be too slow.
+--
+--    4) Generate a random requested false positive rate.
+--
+--    5) Create a bloom filter via easyList function with values XS. If it
+--       fails, discard the test case.
+--
+--    6) Get an actual false positive rate. For each item in YS check if it is
+--       in the bloom filter. If it is, increment false positive count. Divide
+--       the count by length of YS.
+--
+--    7) If the actual false positive rate is lower or close-to-equal to the
+--       requested false positive rate, the test case passes. If not, it is
+--       discarded.
+
+
+{-- From examples/examples/ch26/examples/BloomCheck.hs and modified --}
+import Test.QuickCheck
+import qualified BloomFilter.Easy as B
+import qualified Data.Map.Lazy as Map
+
+-- Instead of failing the testing, use 'discard' so it can continue and report
+-- the overall results
+handyCheck :: Testable a => Int -> a -> IO ()
+-- 'withDiscardRatio 1' means that the max. number of discards for the testing
+-- to still pass will be 1*number of requested successful runs. The number of
+-- all test runs might be different every time because the testing continues
+-- until either the max. success limit or the max. discard limit is
+-- reached. This means that testing passes when number of discards is < than
+-- number of successful runs, and fails otherwise.
+handyCheck limit p = quickCheck $ withDiscardRatio 1 $ withMaxSuccess limit p
+
+falsePositive :: Gen Double
+falsePositive = choose (epsilon, 1 - epsilon)
+  where epsilon = 1e-6
+
+(=~>) :: Either a b -> (b -> Bool) -> Bool
+k =~> f = either (const discard) f k
+{-- End of code from examples --}
+
+
+randomInt :: Gen Int
+randomInt = chooseInt (minBound :: Int, maxBound :: Int)
+
+randomList :: Int -> Gen [Int]
+randomList len = vectorOf len randomInt
+
+randomListWithout :: Int -> [Int] -> Gen [Int]
+randomListWithout len without = go len withoutMap []
+  -- Use Map for faster lookup of excluded Ints
+  where withoutMap = Map.fromList $ zip without (repeat ())
+        go 0 _ res = return res
+        go len wo res = do
+          i <- randomInt
+          case Map.lookup i wo of
+            Nothing ->
+              -- Good value. Not excluded by the 'without' list. Add it to the
+              -- result.
+              go (len-1) wo (i:res)
+            Just _ ->
+              -- Excluded value. Try again.
+              go len wo res
+
+prop_false_positive_rate =
+  let
+      -- These exact values are set based on experimental test runs for the
+      -- testing not to be too slow
+      minLen = 1
+      maxLen = 50
+      ysRatio = 500
+  in
+    -- Choose number of items in the bloom filter
+    forAll (choose (minLen, maxLen) :: Gen Int) $ \len -> do
+      -- Generate values to insert to the filter
+      forAll (randomList len) $ \xs-> do
+        -- Generate a longer list with values that won't be in the filter
+        forAll (randomListWithout (ysRatio * len) xs) $ \ys-> do
+          forAll falsePositive $ \requestedErrRate ->
+            B.easyList requestedErrRate xs =~> \filt ->
+              -- Add 10% margin to the requested rate
+              if (errRate filt ys) <= (requestedErrRate * 1.1)
+              then True
+              else discard
+
+errRate :: B.Bloom Int -> [Int] -> Double
+errRate filt ys = (fromIntegral fp) / fromIntegral (length ys)
+  where fp = countIsElem filt ys
+
+countIsElem :: B.Bloom Int -> [Int] -> Int
+countIsElem filt ys = length $ filter id $ map (`B.elem` filt) ys
+
+
+-- stack ghci --ghci-options "-L$(pwd) -llookup3"
+
+-- ghci> :l 26_a_2.hs
+-- [1 of 7] Compiling BloomFilter.Hash ( BloomFilter/Hash.hs, interpreted )
+--
+--   ... omitted compiler warnings
+--
+-- [2 of 7] Compiling BloomFilter.Internal ( BloomFilter/Internal.hs, interpreted )
+-- [3 of 7] Compiling BloomFilter.Mutable ( BloomFilter/Mutable.hs, interpreted )
+-- [4 of 7] Compiling BloomFilter      ( BloomFilter.hs, interpreted )
+-- [5 of 7] Compiling BloomFilter.Easy ( BloomFilter/Easy.hs, interpreted )
+-- [6 of 7] Compiling Main             ( 26_a_2.hs, interpreted )
+-- Ok, six modules loaded.
+
+-- ghci> handyCheck 100 prop_false_positive_rate
+-- +++ OK, passed 100 tests; 20 discarded.
+
+-- ghci> handyCheck 600 prop_false_positive_rate
+-- +++ OK, passed 600 tests; 179 discarded.
+
+-- ghci> handyCheck 2000 prop_false_positive_rate
+-- +++ OK, passed 2000 tests; 561 discarded.