From bf0916aa6fa864ab678825f303d764209cc529ba Mon Sep 17 00:00:00 2001 From: Michael Krabbe Borregaard Date: Fri, 31 Aug 2018 16:56:29 +0200 Subject: [PATCH 01/12] wip use inplace _interdiff --- src/curveball.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/curveball.jl b/src/curveball.jl index 136d273..5b455da 100644 --- a/src/curveball.jl +++ b/src/curveball.jl @@ -29,6 +29,8 @@ end function _curveball!(m::SparseMatrixCSC{Bool, Int}) R, C = size(m) + mcs = maximum(sum(m, dims = 1)) + uniques, shared = Array{Int}(mcs), Array{Bool}(mcs) for rep ∈ 1:5C A, B = rand(1:C,2) @@ -38,7 +40,7 @@ function _curveball!(m::SparseMatrixCSC{Bool, Int}) l_a, l_b = length(a), length(b) # an efficient algorithm since both a and b are sorted - shared, not_shared = _interdif(a, b) + nunique, l_ab = _interdif!(a, b, uniques, shared) l_ab = length(shared) if !(l_ab ∈ (l_a, l_b)) From df8693fcaccb7432cf06153d9d818f09ec35a40f Mon Sep 17 00:00:00 2001 From: Michael Krabbe Borregaard Date: Sun, 2 Sep 2018 16:14:03 +0200 Subject: [PATCH 02/12] Revert "add link change" This reverts commit a15141cf49d1141c6d64210d2d49933bdd456529. --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 97b611b..231958f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,4 +16,4 @@ matrix: - julia: nightly script: - - julia -e 'import Pkg; Pkg.add("https://github.com/EcoJulia/RandomBooleanMatrices.jl"); Pkg.build("RandomBooleanMatrices"); Pkg.test("RandomBooleanMatrices")' + - julia -e 'import Pkg; Pkg.add(pwd()); Pkg.build("RandomBooleanMatrices"); Pkg.test("RandomBooleanMatrices")' From ac6c340f25cddb0933e78024549f50791adb1d70 Mon Sep 17 00:00:00 2001 From: Michael Krabbe Borregaard Date: Fri, 31 Aug 2018 11:05:18 +0200 Subject: [PATCH 03/12] Revert "use `add` rather than `clone`" This reverts commit b094ef8c07942350dc037b70b799232844e4ef3b. --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 231958f..fe207e0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,4 +16,4 @@ matrix: - julia: nightly script: - - julia -e 'import Pkg; Pkg.add(pwd()); Pkg.build("RandomBooleanMatrices"); Pkg.test("RandomBooleanMatrices")' + - julia -e 'import Pkg; Pkg.clone(pwd()); Pkg.build("RandomBooleanMatrices"); Pkg.test("RandomBooleanMatrices")' From d4a23705a79076ab1fe69e912ec9d68d2cabd5fc Mon Sep 17 00:00:00 2001 From: Michael Krabbe Borregaard Date: Fri, 31 Aug 2018 16:56:29 +0200 Subject: [PATCH 04/12] wip use inplace _interdiff From 3cda23ef1b95a8fde892372b1114da4d44aad9d3 Mon Sep 17 00:00:00 2001 From: Michael Krabbe Borregaard Date: Sun, 2 Sep 2018 23:22:37 +0200 Subject: [PATCH 05/12] Add alternative RNG support --- REQUIRE | 1 + src/RandomBooleanMatrices.jl | 20 +++++++++++--------- src/curveball.jl | 4 ++-- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/REQUIRE b/REQUIRE index 05b5ab4..789178a 100644 --- a/REQUIRE +++ b/REQUIRE @@ -1 +1,2 @@ julia 1.0 +RandomNumbers diff --git a/src/RandomBooleanMatrices.jl b/src/RandomBooleanMatrices.jl index aca4d09..f1ec90b 100644 --- a/src/RandomBooleanMatrices.jl +++ b/src/RandomBooleanMatrices.jl @@ -1,6 +1,7 @@ module RandomBooleanMatrices using Random +using RandomNumbers.Xorshifts using SparseArrays include("curveball.jl") @@ -11,20 +12,21 @@ include("curveball.jl") Randomize the sparse boolean Matrix `m` while maintaining row and column sums """ -function randomize_matrix!(m; method::matrixrandomizations = curveball) +function randomize_matrix!(m, rng = Random.GLOBAL_RNG; method::matrixrandomizations = curveball) if method == curveball - return _curveball!(m) + return _curveball!(m, rng) end error("undefined method") end -struct MatrixGenerator +struct MatrixGenerator{R<:AbstractRNG} m::SparseMatrixCSC{Bool, Int} method::matrixrandomizations + rng::R end """ - random_matrices(m; method = curveball) + random_matrices(m [,rng]; method = curveball) Create a matrix generator function that will return a random boolean matrix every time it is called, maintaining row and column sums. Non-boolean input @@ -39,12 +41,12 @@ random1 = rmg() random2 = rmg() `` """ -random_matrices(m::AbstractMatrix; method::matrixrandomizations = curveball) = - MatrixGenerator(m, method) #for this case there's an implicit `copy` already -random_matrices(m::SparseMatrixCSC{Bool, Int}; method::matrixrandomizations = curveball) = - MatrixGenerator(copy(m), method) +random_matrices(m::AbstractMatrix, rng = Xoroshiro128Plus(); method::matrixrandomizations = curveball) = + MatrixGenerator{typeof(rng)}(dropzeros!(sparse(m)), method, rng) +random_matrices(m::SparseMatrixCSC{Bool, Int}, rng = Xoroshiro128Plus(); method::matrixrandomizations = curveball) = + MatrixGenerator{typeof(rng)}(dropzeros(m), method, rng) -(r::MatrixGenerator)(; method::matrixrandomizations = curveball) = copy(randomize_matrix!(r.m, method = r.method)) +(r::MatrixGenerator)(; method::matrixrandomizations = curveball) = copy(randomize_matrix!(r.m, r.rng, method = r.method)) export randomize_matrix!, random_matrices export curveball diff --git a/src/curveball.jl b/src/curveball.jl index 5b455da..8933641 100644 --- a/src/curveball.jl +++ b/src/curveball.jl @@ -27,13 +27,13 @@ function _interdif(v1, v2) inter, dif end -function _curveball!(m::SparseMatrixCSC{Bool, Int}) +function _curveball!(m::SparseMatrixCSC{Bool, Int}, rng = Random.GLOBAL_RNG) R, C = size(m) mcs = maximum(sum(m, dims = 1)) uniques, shared = Array{Int}(mcs), Array{Bool}(mcs) for rep ∈ 1:5C - A, B = rand(1:C,2) + A, B = rand(rng, 1:C,2) # use views directly into the sparse matrix to avoid copying a, b = view(m.rowval, m.colptr[A]:m.colptr[A+1]-1), view(m.rowval, m.colptr[B]:m.colptr[B+1]-1) From ae1d98d45a290d587a571ed4e902ce75b0603b8a Mon Sep 17 00:00:00 2001 From: Michael Krabbe Borregaard Date: Sun, 2 Sep 2018 23:23:18 +0200 Subject: [PATCH 06/12] use StatsBase.sample! --- REQUIRE | 1 + src/RandomBooleanMatrices.jl | 2 ++ src/curveball.jl | 2 +- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/REQUIRE b/REQUIRE index 789178a..b434806 100644 --- a/REQUIRE +++ b/REQUIRE @@ -1,2 +1,3 @@ julia 1.0 RandomNumbers +StatsBase diff --git a/src/RandomBooleanMatrices.jl b/src/RandomBooleanMatrices.jl index f1ec90b..117b00d 100644 --- a/src/RandomBooleanMatrices.jl +++ b/src/RandomBooleanMatrices.jl @@ -3,6 +3,8 @@ module RandomBooleanMatrices using Random using RandomNumbers.Xorshifts using SparseArrays +using StatsBase + include("curveball.jl") @enum matrixrandomizations curveball diff --git a/src/curveball.jl b/src/curveball.jl index 8933641..6b65c12 100644 --- a/src/curveball.jl +++ b/src/curveball.jl @@ -46,7 +46,7 @@ function _curveball!(m::SparseMatrixCSC{Bool, Int}, rng = Random.GLOBAL_RNG) if !(l_ab ∈ (l_a, l_b)) shuffle!(not_shared) L = l_a - l_ab - a .= sort!([shared; not_shared[1:L]]) + sample!(rng, not_shared, view(newa,1:L), replace = false, ordered = true) b .= sort!([shared; not_shared[L+1:end]]) end end From cad04c7eba24f1207f104512854a619d9341c703 Mon Sep 17 00:00:00 2001 From: Michael Krabbe Borregaard Date: Sun, 2 Sep 2018 23:25:04 +0200 Subject: [PATCH 07/12] use in-place interdif --- src/curveball.jl | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/curveball.jl b/src/curveball.jl index 6b65c12..c747bee 100644 --- a/src/curveball.jl +++ b/src/curveball.jl @@ -1,30 +1,33 @@ -function _interdif(v1, v2) - inter, dif = Int[], Int[] +function sortmerge!(v1, v2, ret, iend, jend) + retind = 0 + i,j = firstindex(v1), firstindex(v2) +function _interdif!(v1, v2, inter, dif) + nshared, ndiff = 0, 0 i,j = firstindex(v1), firstindex(v2) @inbounds while i <= lastindex(v1) || j <= lastindex(v2) if j > lastindex(v2) - push!(dif, v1[i]) + dif[ndiff += 1] = v1[i] i+=1 continue elseif i > lastindex(v1) - push!(dif, v2[j]) + dif[ndiff += 1] = v2[j] j+=1 continue end if v1[i] == v2[j] - push!(inter, v1[i]) + inter[nshared += 1] = v1[i] i += 1 j += 1 elseif j > lastindex(v2) || v1[i] < v2[j] - push!(dif, v1[i]) + dif[ndiff += 1] = v1[i] i+=1 elseif i > lastindex(v1) || v1[i] > v2[j] - push!(dif, v2[j]) + dif[ndiff += 1] = v2[j] j+=1 end end - inter, dif + ndiff, nshared end function _curveball!(m::SparseMatrixCSC{Bool, Int}, rng = Random.GLOBAL_RNG) @@ -40,8 +43,7 @@ function _curveball!(m::SparseMatrixCSC{Bool, Int}, rng = Random.GLOBAL_RNG) l_a, l_b = length(a), length(b) # an efficient algorithm since both a and b are sorted - nunique, l_ab = _interdif!(a, b, uniques, shared) - l_ab = length(shared) + l_dif, l_ab = _interdif!(a, b, shared, not_shared) if !(l_ab ∈ (l_a, l_b)) shuffle!(not_shared) From 171a24e3b5c49f891dd925a1b51b90c27824569c Mon Sep 17 00:00:00 2001 From: Michael Krabbe Borregaard Date: Sun, 2 Sep 2018 23:25:38 +0200 Subject: [PATCH 08/12] Define sortmerge function --- src/curveball.jl | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/curveball.jl b/src/curveball.jl index c747bee..ed85602 100644 --- a/src/curveball.jl +++ b/src/curveball.jl @@ -2,6 +2,29 @@ function sortmerge!(v1, v2, ret, iend, jend) retind = 0 i,j = firstindex(v1), firstindex(v2) + @inbounds while i <= iend || j <= jend + if j > jend + ret[retind += 1] = v1[i] + i+=1 + continue + elseif i > iend + ret[retind += 1] = v2[j] + j+=1 + continue + end + if v1[i] == v2[j] + error("The two vectors are not supposed to have overlapping values") + elseif j > lastindex(v2) || v1[i] < v2[j] + ret[retind += 1] = v1[i] + i+=1 + elseif i > lastindex(v1) || v1[i] > v2[j] + ret[retind += 1] = v2[j] + j+=1 + end + end +end + + function _interdif!(v1, v2, inter, dif) nshared, ndiff = 0, 0 i,j = firstindex(v1), firstindex(v2) @@ -50,6 +73,8 @@ function _curveball!(m::SparseMatrixCSC{Bool, Int}, rng = Random.GLOBAL_RNG) L = l_a - l_ab sample!(rng, not_shared, view(newa,1:L), replace = false, ordered = true) b .= sort!([shared; not_shared[L+1:end]]) + sortmerge!(shared, newa, a, l_ab, L) + sortmerge!(shared, newb, b, l_ab, L2) end end From c107de3b2a12098d7aaa046e6818634ced54d3ba Mon Sep 17 00:00:00 2001 From: Michael Krabbe Borregaard Date: Sun, 2 Sep 2018 23:26:38 +0200 Subject: [PATCH 09/12] use StatsBase.sample! 2 --- src/curveball.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/curveball.jl b/src/curveball.jl index ed85602..aa48065 100644 --- a/src/curveball.jl +++ b/src/curveball.jl @@ -69,8 +69,8 @@ function _curveball!(m::SparseMatrixCSC{Bool, Int}, rng = Random.GLOBAL_RNG) l_dif, l_ab = _interdif!(a, b, shared, not_shared) if !(l_ab ∈ (l_a, l_b)) - shuffle!(not_shared) - L = l_a - l_ab + L = l_a - l_ab + resize!(not_shared, l_dif) sample!(rng, not_shared, view(newa,1:L), replace = false, ordered = true) b .= sort!([shared; not_shared[L+1:end]]) sortmerge!(shared, newa, a, l_ab, L) From ccf1af3e8e871bf0db20d61740ea5e10aa0312d2 Mon Sep 17 00:00:00 2001 From: Michael Krabbe Borregaard Date: Sun, 2 Sep 2018 23:27:43 +0200 Subject: [PATCH 10/12] adjustments --- src/curveball.jl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/curveball.jl b/src/curveball.jl index aa48065..3336d8c 100644 --- a/src/curveball.jl +++ b/src/curveball.jl @@ -55,8 +55,9 @@ end function _curveball!(m::SparseMatrixCSC{Bool, Int}, rng = Random.GLOBAL_RNG) R, C = size(m) - mcs = maximum(sum(m, dims = 1)) - uniques, shared = Array{Int}(mcs), Array{Bool}(mcs) + mcs = min(2maximum(diff(m.colptr)), size(m, 1)) + not_shared, shared = Vector{Int}(undef, mcs), Vector{Int}(undef, mcs) + newa, newb = Vector{Int}(undef, mcs), Vector{Int}(undef, mcs) for rep ∈ 1:5C A, B = rand(rng, 1:C,2) @@ -72,7 +73,8 @@ function _curveball!(m::SparseMatrixCSC{Bool, Int}, rng = Random.GLOBAL_RNG) L = l_a - l_ab resize!(not_shared, l_dif) sample!(rng, not_shared, view(newa,1:L), replace = false, ordered = true) - b .= sort!([shared; not_shared[L+1:end]]) + L2,_ = _interdif!(view(newa, 1:L), not_shared, newa, newb) + sortmerge!(shared, newa, a, l_ab, L) sortmerge!(shared, newb, b, l_ab, L2) end From dcbd157479545b612e840e58fa6e0b3a3782c419 Mon Sep 17 00:00:00 2001 From: Michael Krabbe Borregaard Date: Sun, 2 Sep 2018 23:27:56 +0200 Subject: [PATCH 11/12] resize --- src/curveball.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/curveball.jl b/src/curveball.jl index 3336d8c..fbfe811 100644 --- a/src/curveball.jl +++ b/src/curveball.jl @@ -66,6 +66,7 @@ function _curveball!(m::SparseMatrixCSC{Bool, Int}, rng = Random.GLOBAL_RNG) a, b = view(m.rowval, m.colptr[A]:m.colptr[A+1]-1), view(m.rowval, m.colptr[B]:m.colptr[B+1]-1) l_a, l_b = length(a), length(b) + resize!(not_shared, l_a + l_b) # an efficient algorithm since both a and b are sorted l_dif, l_ab = _interdif!(a, b, shared, not_shared) From d2c3f4612f344660e005528063bfc8c6656bebc3 Mon Sep 17 00:00:00 2001 From: Michael Krabbe Borregaard Date: Sun, 2 Sep 2018 23:37:10 +0200 Subject: [PATCH 12/12] use views rather than resizing --- src/curveball.jl | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/curveball.jl b/src/curveball.jl index fbfe811..1aff4f6 100644 --- a/src/curveball.jl +++ b/src/curveball.jl @@ -66,15 +66,14 @@ function _curveball!(m::SparseMatrixCSC{Bool, Int}, rng = Random.GLOBAL_RNG) a, b = view(m.rowval, m.colptr[A]:m.colptr[A+1]-1), view(m.rowval, m.colptr[B]:m.colptr[B+1]-1) l_a, l_b = length(a), length(b) - resize!(not_shared, l_a + l_b) # an efficient algorithm since both a and b are sorted l_dif, l_ab = _interdif!(a, b, shared, not_shared) if !(l_ab ∈ (l_a, l_b)) L = l_a - l_ab - resize!(not_shared, l_dif) - sample!(rng, not_shared, view(newa,1:L), replace = false, ordered = true) - L2,_ = _interdif!(view(newa, 1:L), not_shared, newa, newb) + + sample!(rng, view(not_shared, Base.OneTo(l_dif)), view(newa,Base.OneTo(L)), replace = false, ordered = true) + L2,_ = _interdif!(view(newa, 1:L), view(not_shared, Base.OneTo(l_dif)), newa, newb) sortmerge!(shared, newa, a, l_ab, L) sortmerge!(shared, newb, b, l_ab, L2)