aboutsummaryrefslogtreecommitdiff
path: root/ch26/26_a_2/26_a_2.hs
diff options
context:
space:
mode:
Diffstat (limited to 'ch26/26_a_2/26_a_2.hs')
-rw-r--r--ch26/26_a_2/26_a_2.hs165
1 files changed, 165 insertions, 0 deletions
diff --git a/ch26/26_a_2/26_a_2.hs b/ch26/26_a_2/26_a_2.hs
new file mode 100644
index 0000000..fdafaeb
--- /dev/null
+++ b/ch26/26_a_2/26_a_2.hs
@@ -0,0 +1,165 @@
+-- Difficult: write a QuickCheck property that checks whether the observed false
+-- positive rate is close to the requested false positive rate.
+
+
+-- I think testing/proving statistical properties can never be done using finite
+-- data. Thus, this test will only be a good effort for adequate testing. This
+-- also implies that instead of trying to make every test run pass, the overall
+-- test result will be determined by comparing number of passes and number of
+-- discards.
+
+-- Requesting false positive rate happens via the easyList function, so this
+-- exercise is about testing that function.
+
+-- The "is close" in the assignment is unclear. The authors probably meant to
+-- check that the false positive rate is lower or close-to-equal than the
+-- requested rate. For the "close" I add 10% margin to the requested false
+-- positive rate when comparing the actual rate to it.
+
+
+-- After running a few experiments with QuickCheck, I found that the initial
+-- lists passed to the property function were, for example '[(),(),(),()]' or
+-- '[0,0,0,0]' when I restricted the type to [Int].
+--
+-- I want to test the worst cases. It means, increasing the chance of getting a
+-- false positive and see whether it is still less or equally frequent than
+-- requested. The more values with different hashes are inserted to the bloom
+-- filter (it means, the more bits are set), the higher chance of a false
+-- positive. Thus, lists filled with the same value are not suitable.
+--
+-- To make the test simpler and easier to control, I will restrict the input
+-- list type to [Int].
+
+-- The test for worst cases will be as follows:
+--
+-- 1) Generate a random number L from range [maxLen..minLen]. The range is
+-- limited so the lists are not too big and the testing not too slow.
+--
+-- 2) Create a list XS of length L of random Int values. They don't have to
+-- be unique. I don't assume properties of the hash function used, so it's
+-- not possible to tell that all-unique values would be the worst case
+-- (set as many bits in the filter as possible).
+--
+-- 3) Create a list YS of length ysRatio*L of random Int values different
+-- from XS. For the worst case it is important that all values in YS are
+-- different from values in XS for getting as many false positives as
+-- possible. It doesn't matter whether YS values are unique among
+-- themselves. YS list should be as long as possible. If it's short, for
+-- example, just 2 values, and both of them are false positives, the
+-- actual false positive rate would be 2/2 = 1.0. The longer the list is,
+-- the higher chance of getting closer to true false positive rate. I
+-- restricted the length of YS to be a multiple of the length of XS. It
+-- should be long enough, but not too long for the testing to be too slow.
+--
+-- 4) Generate a random requested false positive rate.
+--
+-- 5) Create a bloom filter via easyList function with values XS. If it
+-- fails, discard the test case.
+--
+-- 6) Get an actual false positive rate. For each item in YS check if it is
+-- in the bloom filter. If it is, increment false positive count. Divide
+-- the count by length of YS.
+--
+-- 7) If the actual false positive rate is lower or close-to-equal to the
+-- requested false positive rate, the test case passes. If not, it is
+-- discarded.
+
+
+{-- From examples/examples/ch26/examples/BloomCheck.hs and modified --}
+import Test.QuickCheck
+import qualified BloomFilter.Easy as B
+import qualified Data.Map.Lazy as Map
+
+-- Instead of failing the testing, use 'discard' so it can continue and report
+-- the overall results
+handyCheck :: Testable a => Int -> a -> IO ()
+-- 'withDiscardRatio 1' means that the max. number of discards for the testing
+-- to still pass will be 1*number of requested successful runs. The number of
+-- all test runs might be different every time because the testing continues
+-- until either the max. success limit or the max. discard limit is
+-- reached. This means that testing passes when number of discards is < than
+-- number of successful runs, and fails otherwise.
+handyCheck limit p = quickCheck $ withDiscardRatio 1 $ withMaxSuccess limit p
+
+falsePositive :: Gen Double
+falsePositive = choose (epsilon, 1 - epsilon)
+ where epsilon = 1e-6
+
+(=~>) :: Either a b -> (b -> Bool) -> Bool
+k =~> f = either (const discard) f k
+{-- End of code from examples --}
+
+
+randomInt :: Gen Int
+randomInt = chooseInt (minBound :: Int, maxBound :: Int)
+
+randomList :: Int -> Gen [Int]
+randomList len = vectorOf len randomInt
+
+randomListWithout :: Int -> [Int] -> Gen [Int]
+randomListWithout len without = go len withoutMap []
+ -- Use Map for faster lookup of excluded Ints
+ where withoutMap = Map.fromList $ zip without (repeat ())
+ go 0 _ res = return res
+ go len wo res = do
+ i <- randomInt
+ case Map.lookup i wo of
+ Nothing ->
+ -- Good value. Not excluded by the 'without' list. Add it to the
+ -- result.
+ go (len-1) wo (i:res)
+ Just _ ->
+ -- Excluded value. Try again.
+ go len wo res
+
+prop_false_positive_rate =
+ let
+ -- These exact values are set based on experimental test runs for the
+ -- testing not to be too slow
+ minLen = 1
+ maxLen = 50
+ ysRatio = 500
+ in
+ -- Choose number of items in the bloom filter
+ forAll (choose (minLen, maxLen) :: Gen Int) $ \len -> do
+ -- Generate values to insert to the filter
+ forAll (randomList len) $ \xs-> do
+ -- Generate a longer list with values that won't be in the filter
+ forAll (randomListWithout (ysRatio * len) xs) $ \ys-> do
+ forAll falsePositive $ \requestedErrRate ->
+ B.easyList requestedErrRate xs =~> \filt ->
+ -- Add 10% margin to the requested rate
+ if (errRate filt ys) <= (requestedErrRate * 1.1)
+ then True
+ else discard
+
+errRate :: B.Bloom Int -> [Int] -> Double
+errRate filt ys = (fromIntegral fp) / fromIntegral (length ys)
+ where fp = countIsElem filt ys
+
+countIsElem :: B.Bloom Int -> [Int] -> Int
+countIsElem filt ys = length $ filter id $ map (`B.elem` filt) ys
+
+
+-- stack ghci --ghci-options "-L$(pwd) -llookup3"
+
+-- ghci> :l 26_a_2.hs
+-- [1 of 7] Compiling BloomFilter.Hash ( BloomFilter/Hash.hs, interpreted )
+--
+-- ... omitted compiler warnings
+--
+-- [2 of 7] Compiling BloomFilter.Internal ( BloomFilter/Internal.hs, interpreted )
+-- [3 of 7] Compiling BloomFilter.Mutable ( BloomFilter/Mutable.hs, interpreted )
+-- [4 of 7] Compiling BloomFilter ( BloomFilter.hs, interpreted )
+-- [5 of 7] Compiling BloomFilter.Easy ( BloomFilter/Easy.hs, interpreted )
+-- [6 of 7] Compiling Main ( 26_a_2.hs, interpreted )
+-- Ok, six modules loaded.
+
+-- ghci> handyCheck 100 prop_false_positive_rate
+-- +++ OK, passed 100 tests; 20 discarded.
+
+-- ghci> handyCheck 600 prop_false_positive_rate
+-- +++ OK, passed 600 tests; 179 discarded.
+
+-- ghci> handyCheck 2000 prop_false_positive_rate
+-- +++ OK, passed 2000 tests; 561 discarded.