-
Notifications
You must be signed in to change notification settings - Fork 56
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
FR: Mann-Whitney-U test #182
base: developer
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
namespace FSharp.Stats.Testing | ||
|
||
// taken/implemented from: https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test#U_statistic | ||
module UTest = | ||
|
||
open FSharp.Stats | ||
open FSharp.Stats.Testing | ||
|
||
// TO DO: Bergmann et al. (2000) showed that there are different implementations of this test that lead to different results. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please check if your computations are correct before PR can be merged. Additionally, check your results against established packages in R, SPSS or puplications rather than volatile wikipedia chapters. |
||
// They implied that some of them are using a false algorithm. Check if the mathematical derivation from above is wrong too. | ||
// Read: https://www.jstor.org/stable/2685616 | ||
let inline private compute (seq1 : seq<'T>) (seq2 : seq<'T>) = | ||
let sortedMerge = | ||
(seq1 |> Seq.map (fun v -> float v, 0), seq2 |> Seq.map (fun v -> float v, 1)) // 0 = first group; 1 = second group | ||
||> Seq.append | ||
|> Seq.sortByDescending (fun (v,groupIndex) -> v) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is it necessary that the sequences are sorted? The ranking is order-independent and as I can see there is no need for sorting. |
||
|> Array.ofSeq | ||
// let abundance = // method for equal ranks instead of mean ranks when identical values occur. | ||
// sortedMerge | ||
// |> Array.map ( | ||
// fun v -> Array.filter (fun v2 -> v2 = v) sortedMerge | ||
// >> Array.length | ||
// ) | ||
// let myMap = sortedMerge |> Array.mapi (fun i x -> x, i + 2 - Array.item i abundance) |> Map // wrong: must return mean of ranksums with equal ranks, not always the same rank! | ||
// let rankedMerge = sortedMerge |> Array.map (fun (v,group) -> float myMap.[(v,group)],v,group) | ||
Comment on lines
+18
to
+25
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. remove comments |
||
let rankedMerge = // method for mean ranks instead of equal ranks when identical values occur. | ||
sortedMerge | ||
|> Array.map fst | ||
|> Rank.rankAverage | ||
|> fun res -> | ||
(sortedMerge, res) | ||
||> Array.map2 (fun (v,group) rank -> rank, v, group) | ||
let calcRankSum group = | ||
rankedMerge | ||
|> Array.filter (fun (rank,v,group') -> group' = group) | ||
|> Array.fold (fun state (rank,v,group') -> state + rank) 0. | ||
Comment on lines
+35
to
+36
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. replace with countBy
Comment on lines
+35
to
+36
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. replace with countBy |
||
let rankSumSeq1 = calcRankSum 0 | ||
let rankSumSeq2 = calcRankSum 1 | ||
let seq1Length = Seq.length seq1 |> float | ||
let seq2Length = Seq.length seq2 |> float | ||
let u1 = seq1Length * seq2Length + (seq1Length * (seq1Length + 1.) / 2.) - rankSumSeq1 | ||
let u2 = seq1Length * seq2Length + (seq2Length * (seq2Length + 1.) / 2.) - rankSumSeq2 | ||
Comment on lines
+41
to
+42
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. temporary results should not be calculated twice |
||
let uMin = min u1 u2 | ||
let z = (uMin - seq1Length * seq2Length / 2.) / System.Math.Sqrt (seq1Length * seq2Length * (seq1Length + seq2Length + 1.) / 12.) | ||
z | ||
|
||
/// Computes a Mann-Whitney U-test. Aka Wilcoxon-Mann-Whitney test. | ||
/// Use this test for independent samples and the Wilcoxon test (= Wilcoxon ranksum test) for dependent samples. | ||
let inline computeUtest (seq1 : seq<'T>) (seq2 : seq<'T>) = | ||
let z = compute seq1 seq2 | ||
TestStatistics.createUTest z |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -235,6 +235,34 @@ let tTestTests = | |
Expect.floatClose Accuracy.low tTest4.Statistic 0.514 "t statistic should be equal." | ||
] | ||
|
||
|
||
[<Tests>] | ||
let uTestTests = | ||
// taken from https://de.wikipedia.org/wiki/Wilcoxon-Mann-Whitney-Test#Beispiel | ||
let testList1 = | ||
([0;400;500;550;600;650;750;800;900;950;1000;1100;1200;1500;1600;1800;1900;2000;2200;3500 ],["M";"W";"M";"W";"M";"W";"M";"M";"W";"W";"M";"M";"W";"M";"W";"M";"M";"M";"M";"M"]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add duplicates to check ranking procedure |
||
||> List.map2 (fun pay sex -> sex, pay) |> List.sortBy fst | ||
|
||
let testList1A = testList1 |> List.choose (fun (sex,pay) -> if sex = "W" then Some pay else None) | ||
let testList1B = testList1 |> List.choose (fun (sex,pay) -> if sex = "M" then Some pay else None) | ||
|
||
let observedResult1 = UTest.computeUtest testList1A testList1B | ||
let expectedResult1 : TestStatistics.UTestTestStatistics = { | ||
Statistic = -1.15 | ||
PValueTwoTailed = 0.2505 | ||
PValueLeft = 0.875 | ||
PValueRight = 0.1253 | ||
} | ||
|
||
testList "Testing.UTest" [ | ||
testCase "TwoSample" <| fun () -> | ||
Expect.floatClose Accuracy.low observedResult1.PValueLeft expectedResult1.PValueLeft "left p-value should be equal" | ||
Expect.floatClose Accuracy.low observedResult1.PValueRight expectedResult1.PValueRight "right p-value should be equal" | ||
Expect.floatClose Accuracy.low observedResult1.PValueTwoTailed expectedResult1.PValueTwoTailed "p-value should be equal" | ||
Expect.floatClose Accuracy.low observedResult1.Statistic expectedResult1.Statistic "test statistic should be equal" | ||
] | ||
|
||
|
||
[<Tests>] | ||
let chiSquaredTests = | ||
// ChiSquared https://www.graphpad.com/quickcalcs/chisquared2/ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The approximation via the z-Distribution is only valid for large samples.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Since the distribution is discrete, I cannot find any other (continuous) approximation than the one via normal distribution.
https://stats.stackexchange.com/a/251734
So, what do you suggest? Delete the whole test because of normal approximation might be too inaccurate at low sample sizes?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should aim to implement the exact distribution for n+m<100 and the approximation for larger values
see #213 (comment)