From 1f1f93b50337224b17afe0b1f3f5d83177977a01 Mon Sep 17 00:00:00 2001 From: kyo227 <43748518+kyo227@users.noreply.github.com> Date: Thu, 8 Sep 2022 21:51:08 +0800 Subject: [PATCH 1/8] The inequalities in innerjoin are added to the leftjoin --- src/join/join.jl | 198 +++++++++++++++++++++++++++++++++++++++-------- src/join/main.jl | 38 +++++++-- 2 files changed, 196 insertions(+), 40 deletions(-) diff --git a/src/join/join.jl b/src/join/join.jl index 8e0ac90a..f7b07bc2 100644 --- a/src/join/join.jl +++ b/src/join/join.jl @@ -461,35 +461,101 @@ end -function _join_left(dsl, dsr, ::Val{T}; onleft, onright, makeunique = false, mapformats = [true, true], stable = false, alg = HeapSort, check = true, accelerate = false, method = :sort, threads = true, multiple_match::Bool = false, multiple_match_name = :multiple, obs_id = [false, false], obs_id_name = :obs_id) where T +function _join_left(dsl, dsr, ::Val{T}; onleft, onright,onright_range, makeunique = false, mapformats = [true, true], stable = false, alg = HeapSort, check = true, accelerate = false, droprangecols = true, strict_inequality = [false, false], method = :sort, threads = true, multiple_match = false, multiple_match_name = :multiple, obs_id = [false, false], obs_id_name = :obs_id) where T isempty(dsl) && return copy(dsl) - if method == :hash - ranges, a, idx, minval, reps, sz, right_cols = _find_ranges_for_join_using_hash(dsl, dsr, onleft, onright, mapformats, makeunique, Val(T); threads = threads) - elseif method == :sort - oncols_left = onleft - oncols_right = onright + oncols_left = onleft + oncols_right = onright + type = :both + right_range_cols = Int[] + + if onright_range !== nothing + left_range_col = oncols_left[end] + right_range_cols = index(dsr)[filter!(!isequal(nothing), collect(onright_range))] + if droprangecols + right_cols = setdiff(1:length(index(dsr)), [oncols_right; right_range_cols]) + else + right_cols = setdiff(1:length(index(dsr)), oncols_right) + end + + oncols_right = [oncols_right; 
first(right_range_cols)] + if onright_range[1] !== nothing + if strict_inequality[1] + type = :leftstrict + else + type = :left + end + else + if strict_inequality[2] + type = :rightstrict + else + type = :right + end + end + else right_cols = setdiff(1:length(index(dsr)), oncols_right) - if !makeunique && !isempty(intersect(_names(dsl), _names(dsr)[right_cols])) - throw(ArgumentError("duplicate column names, pass `makeunique = true` to make them unique using a suffix automatically." )) + end + + if !makeunique && !isempty(intersect(_names(dsl), _names(dsr)[right_cols])) + throw(ArgumentError("duplicate column names, pass `makeunique = true` to make them unique using a suffix automatically." )) + end + + nsfpaj = true + # if the columns for inequality like join are PA we cannot use the fast path + if type != :both + if any(i-> DataAPI.refpool(_columns(dsr)[i]) !== nothing, right_range_cols) + nsfpaj = false end + end + if method == :hash && (onright_range === nothing || length(onleft) > 1) + if onright_range !== nothing + ranges, a, idx, minval, reps, sz, right_cols_2 = _find_ranges_for_join_using_hash(dsl, dsr, onleft[1:end-1], oncols_right[1:end-1], mapformats, true, Val(T); threads = threads) + filter!(!=(0), reps) + pushfirst!(reps, 1) + our_cumsum!(reps) + pop!(reps) + grng = GIVENRANGE(idx, reps, Int[], length(reps)) + starts, idx, last_valid_range = _sort_for_join_after_hash(dsr, right_range_cols[1], stable, alg, mapformats, nsfpaj, grng; threads = threads) + _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], length(oncols_left); type = type, nsfpaj = nsfpaj, threads = threads) + else + ranges, a, idx, minval, reps, sz, right_cols = _find_ranges_for_join_using_hash(dsl, dsr, onleft, onright, mapformats, makeunique, Val(T); threads = threads) + end + else ranges = Vector{UnitRange{T}}(undef, nrow(dsl)) - if length(oncols_left) == 1 && nrow(dsr)>1 - success, result = _join_left_dict(dsl, dsr, 
ranges, oncols_left, oncols_right, right_cols, Val(T); makeunique = makeunique, mapformats = mapformats, check = check, threads = threads, multiple_match = multiple_match, multiple_match_name = multiple_match_name, obs_id = obs_id, obs_id_name = obs_id_name) + if length(oncols_left) == 1 && type == :both && nrow(dsr)>1 + success, result = _join_inner_dict(dsl, dsr, ranges, oncols_left, oncols_right, right_cols, Val(T); makeunique = makeunique, mapformats = mapformats, check = check, threads = threads, multiple_match = multiple_match, multiple_match_name = multiple_match_name, obs_id = obs_id, obs_id_name = obs_id_name) if success return result end end - idx, uniquemode = _find_permute_and_fill_range_for_join!(ranges, dsr, dsl, oncols_right, oncols_left, stable, alg, mapformats, accelerate; threads = threads) + idx, uniquemode = _find_permute_and_fill_range_for_join!(ranges, dsr, dsl, oncols_right, oncols_left, stable, alg, mapformats, accelerate && (onright_range == nothing || length(oncols_right)>1); nsfpaj = nsfpaj, threads = threads) - for j in 1:length(oncols_left) - _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], j; threads = threads) + for j in 1:length(oncols_left)-1 + _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], j; nsfpaj = nsfpaj, threads = threads) end + _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], length(oncols_left); type = type, nsfpaj = nsfpaj, threads = threads) end + new_ends = map(x -> max(1, length(x)), ranges) our_cumsum!(new_ends) total_length = new_ends[end] + inbits = nothing + revised_ends = nothing + if length(right_range_cols) == 2 + inbits = zeros(Bool, total_length) + # TODO any optimisation is needed for pa? 
+ _fl = identity + _fr = identity + if mapformats[1] + _fl = getformat(dsl, left_range_col) + end + if mapformats[2] + _fr = getformat(dsr, right_range_cols[2]) + end + revised_ends = _mark_lt_part!(inbits, _columns(dsl)[left_range_col], _columns(dsr)[right_range_cols[2]], _fl, _fr, ranges, idx, new_ends, total_length < typemax(Int32) ? Val(Int32) : Val(Int64); strict = strict_inequality[2], threads = threads) + end + if check @assert total_length < 10*nrow(dsl) "the output data set will be very large ($(total_length)×$(ncol(dsl)+length(right_cols))) compared to the left data set size ($(nrow(dsl))×$(ncol(dsl))), make sure that the `on` keyword is selected properly, alternatively, pass `check = false` to ignore this error." end @@ -500,7 +566,6 @@ function _join_left(dsl, dsr, ::Val{T}; onleft, onright, makeunique = false, map res = [] for j in 1:length(index(dsl)) - addmissing = false _res = allocatecol(_columns(dsl)[j], total_length, addmissing = false) if DataAPI.refpool(_res) !== nothing # fill_val = DataAPI.invrefpool(_res)[missing] @@ -509,7 +574,6 @@ function _join_left(dsl, dsr, ::Val{T}; onleft, onright, makeunique = false, map _fill_oncols_left_table_left!(_res, _columns(dsl)[j], ranges, new_ends, total_length, missing; threads = threads) end push!(res, _res) - end if dsl isa SubDataset newds = Dataset(res, copy(index(dsl)), copycols = false) @@ -517,29 +581,29 @@ function _join_left(dsl, dsr, ::Val{T}; onleft, onright, makeunique = false, map newds = Dataset(res, Index(copy(index(dsl).lookup), copy(index(dsl).names), copy(index(dsl).format)), copycols = false) end - for j in 1:length(right_cols) _res = allocatecol(_columns(dsr)[right_cols[j]], total_length) if DataAPI.refpool(_res) !== nothing fill_val = DataAPI.invrefpool(_res)[missing] - _fill_right_cols_table_left!(_res.refs, view(DataAPI.refarray(_columns(dsr)[right_cols[j]]), idx), ranges, new_ends, total_length, fill_val; threads = threads) + _fill_right_cols_table_left!(_res.refs, 
view(DataAPI.refarray(_columns(dsr)[right_cols[j]]), idx), ranges, new_ends, total_length, fill_val, threads = threads) else - _fill_right_cols_table_left!(_res, view(_columns(dsr)[right_cols[j]], idx), ranges, new_ends, total_length, missing; threads = threads) + _fill_right_cols_table_left!(_res, view(_columns(dsr)[right_cols[j]], idx), ranges, new_ends, total_length, missing, threads = threads) end push!(_columns(newds), _res) - new_var_name = make_unique([_names(dsl); _names(dsr)[right_cols[j]]], makeunique = makeunique)[end] push!(index(newds), new_var_name) setformat!(newds, index(newds)[new_var_name], getformat(dsr, _names(dsr)[right_cols[j]])) end + if multiple_match insertcols!(newds, ncol(newds)+1, multiple_match_name => multiple_match_col, unsupported_copy_cols = false, makeunique = makeunique) end + if obs_id[1] obs_id_name1 = Symbol(obs_id_name, "_left") obs_id_left = allocatecol(nrow(dsl) < typemax(Int32) ? Int32 : Int64, total_length) _fill_oncols_left_table_left!(obs_id_left, 1:nrow(dsl), ranges, new_ends, total_length, missing; threads = threads) - insertcols!(newds, ncol(newds)+1, obs_id_name1 => obs_id_left, unsupported_copy_cols = false, makeunique = makeunique) + insertcols!(dsnewdsl, ncol(newds)+1, obs_id_name1 => obs_id_left, unsupported_copy_cols = false, makeunique = makeunique) end if obs_id[2] obs_id_name2 = Symbol(obs_id_name, "_right") @@ -551,29 +615,81 @@ function _join_left(dsl, dsr, ::Val{T}; onleft, onright, makeunique = false, map end -function _join_left!(dsl::Dataset, dsr::AbstractDataset, ::Val{T}; onleft, onright, makeunique = false, mapformats = [true, true], stable = false, alg = HeapSort, check = true, accelerate = false, method = :sort, threads = true, multiple_match = false, multiple_match_name = :multiple, obs_id = [false, false], obs_id_name = :obs_id) where T +function _join_left!(dsl::Dataset, dsr::AbstractDataset, ::Val{T}; onleft, onright, onright_range, makeunique = false,mapformats = [true, true], stable = false, 
alg = HeapSort, check = true, accelerate = false, droprangecols = true, strict_inequality = [false, false], method = :sort, threads = true, multiple_match = false, multiple_match_name = :multiple, obs_id = [false, false], obs_id_name = :obs_id) where T isempty(dsl) && return dsl - if method == :hash - ranges, a, idx, minval, reps, sz, right_cols = _find_ranges_for_join_using_hash(dsl, dsr, onleft, onright, mapformats, makeunique, Val(T); threads = threads) - elseif method == :sort - oncols_left = onleft - oncols_right = onright + oncols_left = onleft + oncols_right = onright + type = :both + right_range_cols = Int[] + + if onright_range !== nothing + left_range_col = oncols_left[end] + right_range_cols = index(dsr)[filter!(!isequal(nothing), collect(onright_range))] + if droprangecols + right_cols = setdiff(1:length(index(dsr)), [oncols_right; right_range_cols]) + else + right_cols = setdiff(1:length(index(dsr)), oncols_right) + end + + oncols_right = [oncols_right; first(right_range_cols)] + if onright_range[1] !== nothing + if strict_inequality[1] + type = :leftstrict + else + type = :left + end + else + if strict_inequality[2] + type = :rightstrict + else + type = :right + end + end + else right_cols = setdiff(1:length(index(dsr)), oncols_right) - if !makeunique && !isempty(intersect(_names(dsl), _names(dsr)[right_cols])) - throw(ArgumentError("duplicate column names, pass `makeunique = true` to make them unique using a suffix automatically." )) + end + + if !makeunique && !isempty(intersect(_names(dsl), _names(dsr)[right_cols])) + throw(ArgumentError("duplicate column names, pass `makeunique = true` to make them unique using a suffix automatically." 
)) + end + + nsfpaj = true + # if the columns for inequality like join are PA we cannot use the fast path + if type != :both + if any(i-> DataAPI.refpool(_columns(dsr)[i]) !== nothing, right_range_cols) + nsfpaj = false end + end + + if method == :hash && (onright_range === nothing || length(onleft) > 1) + if onright_range !== nothing + ranges, a, idx, minval, reps, sz, right_cols_2 = _find_ranges_for_join_using_hash(dsl, dsr, onleft[1:end-1], oncols_right[1:end-1], mapformats, true, Val(T); threads = threads) + filter!(!=(0), reps) + pushfirst!(reps, 1) + our_cumsum!(reps) + pop!(reps) + grng = GIVENRANGE(idx, reps, Int[], length(reps)) + starts, idx, last_valid_range = _sort_for_join_after_hash(dsr, right_range_cols[1], stable, alg, mapformats, nsfpaj, grng; threads = threads) + _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], length(oncols_left); type = type, nsfpaj = nsfpaj, threads = threads) + else + ranges, a, idx, minval, reps, sz, right_cols = _find_ranges_for_join_using_hash(dsl, dsr, onleft, onright, mapformats, makeunique, Val(T); threads = threads) + end + else ranges = Vector{UnitRange{T}}(undef, nrow(dsl)) - if length(oncols_left) == 1 && nrow(dsr)>1 - success, result = _join_left!_dict(dsl, dsr, ranges, oncols_left, oncols_right, right_cols, Val(T); makeunique = makeunique, mapformats = mapformats, check = check, threads = threads, multiple_match = multiple_match, multiple_match_name = multiple_match_name, obs_id = obs_id, obs_id_name = obs_id_name) + if length(oncols_left) == 1 && type == :both && nrow(dsr)>1 + success, result = _join_inner_dict(dsl, dsr, ranges, oncols_left, oncols_right, right_cols, Val(T); makeunique = makeunique, mapformats = mapformats, check = check, threads = threads, multiple_match = multiple_match, multiple_match_name = multiple_match_name, obs_id = obs_id, obs_id_name = obs_id_name) if success return result end end - idx, uniquemode = 
_find_permute_and_fill_range_for_join!(ranges, dsr, dsl, oncols_right, oncols_left, stable, alg, mapformats, accelerate, threads = threads) - for j in 1:length(oncols_left) - _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], j, threads = threads) + idx, uniquemode = _find_permute_and_fill_range_for_join!(ranges, dsr, dsl, oncols_right, oncols_left, stable, alg, mapformats, accelerate && (onright_range == nothing || length(oncols_right)>1); nsfpaj = nsfpaj, threads = threads) + + for j in 1:length(oncols_left)-1 + _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], j; nsfpaj = nsfpaj, threads = threads) end + _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], length(oncols_left); type = type, nsfpaj = nsfpaj, threads = threads) end + if !all(x->length(x) <= 1, ranges) throw(ArgumentError("`leftjoin!` can only be used when each observation in left data set matches at most one observation from right data set")) end @@ -582,6 +698,22 @@ function _join_left!(dsl::Dataset, dsr::AbstractDataset, ::Val{T}; onleft, onrig our_cumsum!(new_ends) total_length = new_ends[end] + inbits = nothing + revised_ends = nothing + if length(right_range_cols) == 2 + inbits = zeros(Bool, total_length) + # TODO any optimisation is needed for pa? + _fl = identity + _fr = identity + if mapformats[1] + _fl = getformat(dsl, left_range_col) + end + if mapformats[2] + _fr = getformat(dsr, right_range_cols[2]) + end + revised_ends = _mark_lt_part!(inbits, _columns(dsl)[left_range_col], _columns(dsr)[right_range_cols[2]], _fl, _fr, ranges, idx, new_ends, total_length < typemax(Int32) ? 
Val(Int32) : Val(Int64); strict = strict_inequality[2], threads = threads) + end + if check @assert total_length < 10*nrow(dsl) "the output data set will be very large ($(total_length)×$(ncol(dsl)+length(right_cols))) compared to the left data set size ($(nrow(dsl))×$(ncol(dsl))), make sure that the `on` keyword is selected properly, alternatively, pass `check = false` to ignore this error." end diff --git a/src/join/main.jl b/src/join/main.jl index d59cb3d2..105c3054 100644 --- a/src/join/main.jl +++ b/src/join/main.jl @@ -131,7 +131,7 @@ julia> leftjoin(dsl, dsr, on = :year, mapformats = true) # Use formats for datas 4 │ 2012 true missing ``` """ -function DataAPI.leftjoin(dsl::AbstractDataset, dsr::AbstractDataset; on = nothing, makeunique = false, mapformats::Union{Bool, Vector{Bool}} = true, stable = false, alg = HeapSort, check = true, accelerate = false, method::Symbol = :sort, threads::Bool = true, multiple_match::Bool = false, multiple_match_name = :multiple, obs_id::Union{Bool, Vector{Bool}} = false, obs_id_name = :obs_id) +function DataAPI.leftjoin(dsl::AbstractDataset, dsr::AbstractDataset; on = nothing, makeunique = false, mapformats::Union{Bool, Vector{Bool}} = true, stable = false, alg = HeapSort, check = true, accelerate = false, droprangecols::Bool = true, strict_inequality = false, method::Symbol = :sort, threads::Bool = true, multiple_match::Bool = false, multiple_match_name = :multiple, obs_id::Union{Bool, Vector{Bool}} = false, obs_id_name = :obs_id) !(method in (:hash, :sort)) && throw(ArgumentError("method must be :hash or :sort")) on === nothing && throw(ArgumentError("`on` keyword must be specified")) if !(on isa AbstractVector) @@ -151,14 +151,26 @@ function DataAPI.leftjoin(dsl::AbstractDataset, dsr::AbstractDataset; on = nothi length(obs_id) !== 2 && throw(ArgumentError("`obs_id` must be a Bool or a vector of Bool with size two")) end + # strict_inequality + if !(strict_inequality isa AbstractVector) + strict_inequality = 
repeat([strict_inequality], 2) + else + length(strict_inequality) !== 2 && throw(ArgumentError("`strict_inequality` must be a Bool or a vector of Bool with size two")) + end + if typeof(on) <: AbstractVector{<:Union{AbstractString, Symbol}} onleft = multiple_getindex(index(dsl), on) onright = multiple_getindex(index(dsr), on) - + onright_range = nothing elseif (typeof(on) <: AbstractVector{<:Pair{<:ColumnIndex, <:ColumnIndex}}) || (typeof(on) <: AbstractVector{<:Pair{<:AbstractString, <:AbstractString}}) onleft = multiple_getindex(index(dsl), map(x->x.first, on)) onright = multiple_getindex(index(dsr), map(x->x.second, on)) - + onright_range = nothing + elseif (typeof(on) <: AbstractVector{<:Pair{<:ColumnIndex, <:Any}}) || (typeof(on) <: AbstractVector{<:Pair{<:AbstractString, <:Any}}) + onleft = multiple_getindex(index(dsl), map(x->x.first, on)) + onright = multiple_getindex(index(dsr), map(x->x.second, on[1:end-1])) + onright_range = on[end].second + !(onright_range isa Tuple) && throw(ArgumentError("For range join the last element of `on` keyword argument for the right table must be a Tuple of column names")) else throw(ArgumentError("`on` keyword must be a vector of column names or a vector of pairs of column names")) end @@ -170,7 +182,7 @@ end Variant of `leftjoin` that performs `leftjoin` in place for special case that the number of matching rows from the right data set is at most one. 
""" -function leftjoin!(dsl::Dataset, dsr::AbstractDataset; on = nothing, makeunique = false, mapformats::Union{Bool, Vector{Bool}} = true, stable = false, alg = HeapSort, accelerate = false, method::Symbol = :sort, threads::Bool = true, multiple_match::Bool=false, multiple_match_name = :multiple, obs_id::Union{Bool, Vector{Bool}} = false, obs_id_name = :obs_id) +function leftjoin!(dsl::Dataset, dsr::AbstractDataset; on = nothing, makeunique = false, mapformats::Union{Bool, Vector{Bool}} = true, stable = false, alg = HeapSort, accelerate = false, strict_inequality = false, method::Symbol = :sort, threads::Bool = true, droprangecols::Bool = true, multiple_match::Bool=false, multiple_match_name = :multiple, obs_id::Union{Bool, Vector{Bool}} = false, obs_id_name = :obs_id) !(method in (:hash, :sort)) && throw(ArgumentError("method must be :hash or :sort")) on === nothing && throw(ArgumentError("`on` keyword must be specified")) if !(on isa AbstractVector) @@ -188,18 +200,30 @@ function leftjoin!(dsl::Dataset, dsr::AbstractDataset; on = nothing, makeunique else length(obs_id) !== 2 && throw(ArgumentError("`obs_id` must be a Bool or a vector of Bool with size two")) end + # strict_inequality + if !(strict_inequality isa AbstractVector) + strict_inequality = repeat([strict_inequality], 2) + else + length(strict_inequality) !== 2 && throw(ArgumentError("`strict_inequality` must be a Bool or a vector of Bool with size two")) + end + if typeof(on) <: AbstractVector{<:Union{AbstractString, Symbol}} onleft = multiple_getindex(index(dsl), on) onright = multiple_getindex(index(dsr), on) - + onright_range = nothing elseif (typeof(on) <: AbstractVector{<:Pair{<:ColumnIndex, <:ColumnIndex}}) || (typeof(on) <: AbstractVector{<:Pair{<:AbstractString, <:AbstractString}}) onleft = multiple_getindex(index(dsl), map(x->x.first, on)) onright = multiple_getindex(index(dsr), map(x->x.second, on)) - + onright_range = nothing + elseif (typeof(on) <: AbstractVector{<:Pair{<:ColumnIndex, 
<:Any}}) || (typeof(on) <: AbstractVector{<:Pair{<:AbstractString, <:Any}}) + onleft = multiple_getindex(index(dsl), map(x->x.first, on)) + onright = multiple_getindex(index(dsr), map(x->x.second, on[1:end-1])) + onright_range = on[end].second + !(onright_range isa Tuple) && throw(ArgumentError("For range join the last element of `on` keyword argument for the right table must be a Tuple of column names")) else throw(ArgumentError("`on` keyword must be a vector of column names or a vector of pairs of column names")) end - _join_left!(dsl, dsr, nrow(dsr) < typemax(Int32) ? Val(Int32) : Val(Int64), onleft = onleft, onright = onright, makeunique = makeunique, mapformats = mapformats, check = false, method = method, threads = threads, multiple_match = multiple_match, multiple_match_name = multiple_match_name, obs_id = obs_id, obs_id_name = obs_id_name) + _join_left!(dsl, dsr, nrow(dsr) < typemax(Int32) ? Val(Int32) : Val(Int64), onleft = onleft, onright = onright, onright_range = onright_range, stable = stable, strict_inequality = strict_inequality, makeunique = makeunique, mapformats = mapformats,accelerate = accelerate, check = false,droprangecols = droprangecols, method = method, threads = threads, multiple_match = multiple_match, multiple_match_name = multiple_match_name, obs_id = obs_id, obs_id_name = obs_id_name) end """ From 10331b27fe7c25f202a8f33cb7622f6041a1cfba Mon Sep 17 00:00:00 2001 From: kyo227 <43748518+kyo227@users.noreply.github.com> Date: Thu, 8 Sep 2022 22:19:36 +0800 Subject: [PATCH 2/8] fix some bug --- src/join/main.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/join/main.jl b/src/join/main.jl index 105c3054..d3a77136 100644 --- a/src/join/main.jl +++ b/src/join/main.jl @@ -174,7 +174,7 @@ function DataAPI.leftjoin(dsl::AbstractDataset, dsr::AbstractDataset; on = nothi else throw(ArgumentError("`on` keyword must be a vector of column names or a vector of pairs of column names")) end - _join_left(dsl, dsr, nrow(dsr) < 
typemax(Int32) ? Val(Int32) : Val(Int64), onleft = onleft, onright = onright, makeunique = makeunique, mapformats = mapformats, stable = stable, alg = alg, check = check, accelerate = accelerate, method = method, threads = threads, multiple_match = multiple_match, multiple_match_name = multiple_match_name, obs_id = obs_id, obs_id_name = obs_id_name) + _join_left(dsl, dsr, nrow(dsr) < typemax(Int32) ? Val(Int32) : Val(Int64), onleft = onleft, onright = onright, onright_range = onright_range, stable = stable, strict_inequality = strict_inequality, makeunique = makeunique, mapformats = mapformats,accelerate = accelerate, check = false,droprangecols = droprangecols, method = method, threads = threads, multiple_match = multiple_match, multiple_match_name = multiple_match_name, obs_id = obs_id, obs_id_name = obs_id_name) end """ From 1a203a16e42933e234dafe7e73c9a2f2f805118d Mon Sep 17 00:00:00 2001 From: kyo227 <43748518+kyo227@users.noreply.github.com> Date: Sat, 10 Sep 2022 13:49:44 +0800 Subject: [PATCH 3/8] Fix some bugs for 1 => (:lower, :higher) situation. 
--- src/join/join.jl | 89 +++++++++++++++++++++++++++++++++++++++--------- test/join.jl | 36 ++++++++++++++++++++ 2 files changed, 108 insertions(+), 17 deletions(-) diff --git a/src/join/join.jl b/src/join/join.jl index f7b07bc2..f6a56ccc 100644 --- a/src/join/join.jl +++ b/src/join/join.jl @@ -146,6 +146,22 @@ function _fill_val_join!(x, r2, val, inbits, r) end end end + +function left_fill_val_join!(x, val, inbits, r) + cnt = 1 + lo = r.start + for i in r + if inbits[i] + x[cnt+lo-1] = val + else + if cnt == 1 + x[cnt+lo-1] = val + end + end + cnt += 1 + end +end + # F1 and F2 are here for type stability when threads = false function _find_ranges_for_join!(ranges, x, y, _fl::F1, _fr::F2, ::Val{T1}, ::Val{T2}; type = :both, threads = true) where {T1, T2, F1, F2} if type == :both @@ -226,11 +242,21 @@ function _find_ranges_for_join_pa!(ranges, x, invpool, y, _fl::F1, _fr::F2, ::Va end -function _fill_oncols_left_table_left!(_res, x, ranges, en, total, fill_val; threads = true) - @_threadsfor threads for i in 1:length(x) - i == 1 ? lo = 1 : lo = en[i - 1] + 1 - hi = en[i] - _fill_val_join!(_res, lo:hi, x[i]) +function _fill_oncols_left_table_left!(_res, x, ranges, en, total, fill_val;inbits = nothing, en2 = nothing, threads = true) + if inbits === nothing + @_threadsfor threads for i in 1:length(x) + i == 1 ? lo = 1 : lo = en[i - 1] + 1 + hi = en[i] + _fill_val_join!(_res, lo:hi, x[i]) + end + else + @_threadsfor threads for i in 1:length(x) + i == 1 ? lo = 1 : lo = en[i - 1] + 1 + hi = en[i] + # @show sum(view(inbits, lo:hi)) + # sum(view(inbits, lo:hi)) == 0 && continue + left_fill_val_join!(_res, x[i], inbits, lo:hi) + end end @_threadsfor threads for i in en[length(x)]+1:total _res[i] = fill_val @@ -273,11 +299,19 @@ function _fill_oncols_left_table_anti!(_res, x, ranges, en, total; threads = tru end end -function _fill_right_cols_table_left!(_res, x, ranges, en, total, fill_val; threads = true) - @_threadsfor threads for i in 1:length(ranges) - i == 1 ? 
lo = 1 : lo = en[i - 1] + 1 - hi = en[i] - length(ranges[i]) == 0 ? _fill_val_join!(_res, lo:hi, fill_val) : copyto!(_res, lo, x, ranges[i].start, length(ranges[i])) +function _fill_right_cols_table_left!(_res, x, ranges, en, total, fill_val; inbits = nothing, en2 = nothing, threads = true) + if inbits === nothing + @_threadsfor threads for i in 1:length(ranges) + i == 1 ? lo = 1 : lo = en[i - 1] + 1 + hi = en[i] + length(ranges[i]) == 0 ? _fill_val_join!(_res, lo:hi, fill_val) : copyto!(_res, lo, x, ranges[i].start, length(ranges[i])) + end + else + @_threadsfor threads for i in 1:length(ranges) + i == 1 ? lo = 1 : lo = en[i - 1] + 1 + hi = en[i] + left_fill_right_col_range!(_res, x, ranges[i], inbits, lo:hi) + end end end @@ -294,6 +328,22 @@ function _fill_right_col_range!(_res, r2, x, ranges, inbits, r) end end +function left_fill_right_col_range!(_res, x, ranges, inbits, r) + cnt = 1 + cnt_r = 1 + lo = r.start + for i in r + if inbits[i] + _res[lo+cnt-1] = x[ranges[cnt_r]] + cnt += 1 + else + _res[lo+cnt-1] = missing + cnt += 1 + end + cnt_r += 1 + end +end + function _fill_right_cols_table_inner!(_res, x, ranges, en, total; inbits = nothing, en2 = nothing, threads = true) if inbits === nothing @_threadsfor threads for i in 1:length(ranges) @@ -569,9 +619,9 @@ function _join_left(dsl, dsr, ::Val{T}; onleft, onright,onright_range, makeuniqu _res = allocatecol(_columns(dsl)[j], total_length, addmissing = false) if DataAPI.refpool(_res) !== nothing # fill_val = DataAPI.invrefpool(_res)[missing] - _fill_oncols_left_table_left!(_res.refs, DataAPI.refarray(_columns(dsl)[j]), ranges, new_ends, total_length, missing; threads = threads) + _fill_oncols_left_table_left!(_res.refs, DataAPI.refarray(_columns(dsl)[j]), ranges, new_ends, total_length, missing; inbits = inbits, en2 = revised_ends, threads = threads) else - _fill_oncols_left_table_left!(_res, _columns(dsl)[j], ranges, new_ends, total_length, missing; threads = threads) + _fill_oncols_left_table_left!(_res, 
_columns(dsl)[j], ranges, new_ends, total_length, missing; inbits = inbits, en2 = revised_ends,threads = threads) end push!(res, _res) end @@ -585,9 +635,9 @@ function _join_left(dsl, dsr, ::Val{T}; onleft, onright,onright_range, makeuniqu _res = allocatecol(_columns(dsr)[right_cols[j]], total_length) if DataAPI.refpool(_res) !== nothing fill_val = DataAPI.invrefpool(_res)[missing] - _fill_right_cols_table_left!(_res.refs, view(DataAPI.refarray(_columns(dsr)[right_cols[j]]), idx), ranges, new_ends, total_length, fill_val, threads = threads) + _fill_right_cols_table_left!(_res.refs, view(DataAPI.refarray(_columns(dsr)[right_cols[j]]), idx), ranges, new_ends, total_length, fill_val,inbits = inbits, en2 = revised_ends, threads = threads) else - _fill_right_cols_table_left!(_res, view(_columns(dsr)[right_cols[j]], idx), ranges, new_ends, total_length, missing, threads = threads) + _fill_right_cols_table_left!(_res, view(_columns(dsr)[right_cols[j]], idx), ranges, new_ends, total_length, missing,inbits = inbits, en2 = revised_ends, threads = threads) end push!(_columns(newds), _res) new_var_name = make_unique([_names(dsl); _names(dsr)[right_cols[j]]], makeunique = makeunique)[end] @@ -602,16 +652,21 @@ function _join_left(dsl, dsr, ::Val{T}; onleft, onright,onright_range, makeuniqu if obs_id[1] obs_id_name1 = Symbol(obs_id_name, "_left") obs_id_left = allocatecol(nrow(dsl) < typemax(Int32) ? 
Int32 : Int64, total_length) - _fill_oncols_left_table_left!(obs_id_left, 1:nrow(dsl), ranges, new_ends, total_length, missing; threads = threads) + _fill_oncols_left_table_left!(obs_id_left, 1:nrow(dsl), ranges, new_ends, total_length, missing;inbits = inbits, en2 = revised_ends, threads = threads) insertcols!(dsnewdsl, ncol(newds)+1, obs_id_name1 => obs_id_left, unsupported_copy_cols = false, makeunique = makeunique) end if obs_id[2] obs_id_name2 = Symbol(obs_id_name, "_right") obs_id_right = allocatecol(T, total_length) - _fill_right_cols_table_left!(obs_id_right, idx, ranges, new_ends, total_length, missing, threads = threads) + _fill_right_cols_table_left!(obs_id_right, idx, ranges, new_ends, total_length, missing, inbits = inbits, en2 = revised_ends, threads = threads) insertcols!(newds, ncol(newds)+1, obs_id_name2 => obs_id_right, unsupported_copy_cols = false, makeunique = makeunique) end - newds + + if length(right_range_cols) == 2 + filter(newds,:,by = !ismissing, type = any) + else + newds + end end diff --git a/test/join.jl b/test/join.jl index 443c3879..87d47ec6 100644 --- a/test/join.jl +++ b/test/join.jl @@ -1903,6 +1903,23 @@ end @test inn_r1_a == inn_r1_t @test inn_r1_v_a == inn_r1_t + left_r1 = leftjoin(store, roster, on = [:date => (:start_date, nothing)], makeunique = true, stable = true) + left_r1_v = leftjoin(store, view(roster, :, :), on = [:date => (:start_date, nothing)], makeunique = true, stable = true) + left_r1_a = leftjoin(store, roster, on = [:date => (:start_date, nothing)], makeunique = true, stable = true, accelerate = true) + left_r1_v_a = leftjoin(store, view(roster, :, :), on = [:date => (:start_date, nothing)], makeunique = true, stable = true, accelerate = true) + + @test left_r1 == leftjoin(store, roster, on = [:date => (:start_date, nothing)], makeunique = true, stable = true, method = :hash) + @test left_r1_v == leftjoin(store, view(roster, :, :), on = [:date => (:start_date, nothing)], makeunique = true, stable = true, 
method = :hash) + @test left_r1_a == leftjoin(store, roster, on = [:date => (:start_date, nothing)], makeunique = true, stable = true, accelerate = true, method = :hash) + @test left_r1_v_a == leftjoin(store, view(roster, :, :), on = [:date => (:start_date, nothing)], makeunique = true, stable = true, accelerate = true, method = :hash) + + + left_r1_t = Dataset([Union{Missing, Date}[Date("2019-10-05"), Date("2019-10-05"), Date("2019-10-05"), Date("2019-10-05"), Date("2019-10-05"), Date("2019-10-05"), Date("2019-10-05"), Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-02"), Date("2019-10-02"), Date("2019-10-02"), Date("2019-10-02"), Date("2020-01-01"), Date("2020-01-01"), Date("2020-01-01"), Date("2020-01-01"), Date("2020-01-01"), Date("2020-01-01"), Date("2020-01-01"), Date("2020-01-01"), Date("2019-10-01"), Date("2019-10-01"), Date("2019-10-02"), Date("2019-10-02"), Date("2019-10-02"), Date("2019-10-02"), Date("2019-10-05"), Date("2019-10-05"), Date("2019-10-05"), Date("2019-10-05"), Date("2019-10-05"), Date("2019-10-05"), Date("2019-10-05"), Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03")], Union{Missing, String}["A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "A", "A", "A", "A", "A", "A", "A", "A", "B", "B", "A", "A", "A", "A", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "B", "B"], Union{Missing, String}["A", "B", "A", 
"B", "A", "B", "A", "B", "A", "B", "A", "B", "A", "B", "A", "B", "A", "B", "A", "B", "A", "B", "A", "B", "A", "B", "A", "B", "A", "B", "A", "B", "A", "B", "A", "B", "A", "B", "A", "B", "A", "B", "A", "B", "A", "B", "A", "B", "A", "B", "A", "B", "A", "B", "A", "B", "A", "B", "A", "B", "A", "B"], Union{Missing, Int64}[1, 5, 2, 6, 3, 7, 4, 8, 1, 5, 2, 6, 3, 7, 4, 8, 1, 5, 2, 6, 1, 5, 2, 6, 3, 7, 4, 8, 1, 5, 1, 5, 2, 6, 1, 5, 2, 6, 3, 7, 4, 8, 1, 5, 2, 6, 3, 7, 4, 8, 1, 5, 2, 6, 3, 7, 1, 5, 2, 6, 3, 7], Union{Missing, Date}[Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-05"), Date("2019-10-05"), Date("2019-10-06"), Date("2019-10-06"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-05"), Date("2019-10-05"), Date("2019-10-06"), Date("2019-10-06"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-05"), Date("2019-10-05"), Date("2019-10-06"), Date("2019-10-06"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-05"), Date("2019-10-05"), Date("2019-10-06"), Date("2019-10-06"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-05"), Date("2019-10-05"), Date("2019-10-06"), Date("2019-10-06"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-05"), Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-05"), Date("2019-10-05")]], ["date", "store", "store_1", "employee_ID", "end_date"]) + @test left_r1 == left_r1_t + @test left_r1_v == left_r1_t + @test left_r1_a == left_r1_t + @test left_r1_v_a == left_r1_t + inn_r1 = innerjoin(store, roster, on = [:store 
=> :store, :date => (:start_date, nothing)], stable = true) inn_r1_v = innerjoin(store, view(roster, :, [1,2,4,3]), on = [:store => :store, :date => (:start_date, nothing)], stable = true) inn_r1_a = innerjoin(store, roster, on = [:store => :store, :date => (:start_date, nothing)], stable = true, accelerate = true) @@ -1918,6 +1935,23 @@ end @test inn_r1_v == inn_r1_t @test inn_r1_a == inn_r1_t @test inn_r1_v_a == inn_r1_t + + left_r1 = leftjoin(store, roster, on = [:store => :store, :date => (:start_date, nothing)], stable = true) + left_r1_v = leftjoin(store, view(roster, :, [1,2,4,3]), on = [:store => :store, :date => (:start_date, nothing)], stable = true) + left_r1_a = leftjoin(store, roster, on = [:store => :store, :date => (:start_date, nothing)], stable = true, accelerate = true) + left_r1_v_a = leftjoin(store, view(roster, :, [1,2,4,3]), on = [:store => :store, :date => (:start_date, nothing)], stable = true, accelerate = true) + + @test left_r1 == leftjoin(store, roster, on = [:store => :store, :date => (:start_date, nothing)], stable = true, method = :hash) + @test left_r1_v == leftjoin(store, view(roster, :, [1,2,4,3]), on = [:store => :store, :date => (:start_date, nothing)], stable = true, method = :hash) + @test left_r1_a == leftjoin(store, roster, on = [:store => :store, :date => (:start_date, nothing)], stable = true, accelerate = true, method = :hash) + @test left_r1_v_a == leftjoin(store, view(roster, :, [1,2,4,3]), on = [:store => :store, :date => (:start_date, nothing)], stable = true, accelerate = true, method = :hash) + + left_r1_t = Dataset([Union{Missing, Date}[Date("2019-10-05"), Date("2019-10-05"), Date("2019-10-05"), Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-02"), Date("2019-10-02"), Date("2020-01-01"), Date("2020-01-01"), Date("2020-01-01"), Date("2020-01-01"), Date("2019-10-01"), Date("2019-10-02"), Date("2019-10-02"), Date("2019-10-05"), Date("2019-10-05"), 
Date("2019-10-05"), Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03")], Union{Missing, String}["A", "A", "A", "A", "A", "A", "A", "A", "B", "B", "A", "A", "A", "A", "B", "A", "A", "B", "B", "B", "B", "B", "B", "B", "B", "A", "A", "A", "B", "B", "B"], Union{Missing, Int64}[1, 2, 3, 4, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 1, 2, 5, 6, 7, 8, 5, 6, 7, 8, 1, 2, 3, 5, 6, 7], Union{Missing, Date}[Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-05"), Date("2019-10-06"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-05"), Date("2019-10-06"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-05"), Date("2019-10-06"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-05"), Date("2019-10-06"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-05"), Date("2019-10-06"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-05")]], ["date", "store", "employee_ID", "end_date"]) + @test left_r1 == left_r1_t + @test left_r1_v == left_r1_t + @test left_r1_a == left_r1_t + @test left_r1_v_a == left_r1_t + inn_r1 = innerjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], stable = true) inn_r1_v = innerjoin(store, view(roster, :, [1,2,4,3]), on = [:store => :store, :date => (:start_date, :end_date)], stable = true) @@ -1935,6 +1969,8 @@ end @test inn_r1_a == inn_r1_t @test inn_r1_v_a == inn_r1_t + # + inn_r1 = innerjoin(store, roster, on = [:store => :store, :date => (:end_date, :start_date)], stable = true) inn_r1_v = innerjoin(store, view(roster, :, [1,2,4,3]), on = [:store => :store, :date => (:end_date, :start_date)], stable = true) inn_r1_a = innerjoin(store, roster, on = [:store => :store, :date => (:end_date, 
:start_date)], stable = true, accelerate = true) From baf921f2a2711aaf6df4bec0ddd698f357445943 Mon Sep 17 00:00:00 2001 From: kyo227 <43748518+kyo227@users.noreply.github.com> Date: Sat, 10 Sep 2022 17:47:50 +0800 Subject: [PATCH 4/8] I used a different method to deal with the problem of (:lower, :higher) in leftjoin, where the ranges and inbits do not match, so the result had extra rows. The last version used the filter() function, which greatly affected the efficiency, so I changed it. --- src/join/join.jl | 179 +++++++++++++++++++++++++++-------------------- 1 file changed, 103 insertions(+), 76 deletions(-) diff --git a/src/join/join.jl b/src/join/join.jl index f6a56ccc..7d5b454e 100644 --- a/src/join/join.jl +++ b/src/join/join.jl @@ -147,21 +147,6 @@ function _fill_val_join!(x, r2, val, inbits, r) end end -function left_fill_val_join!(x, val, inbits, r) - cnt = 1 - lo = r.start - for i in r - if inbits[i] - x[cnt+lo-1] = val - else - if cnt == 1 - x[cnt+lo-1] = val - end - end - cnt += 1 - end -end - # F1 and F2 are here for type stability when threads = false function _find_ranges_for_join!(ranges, x, y, _fl::F1, _fr::F2, ::Val{T1}, ::Val{T2}; type = :both, threads = true) where {T1, T2, F1, F2} if type == :both @@ -241,23 +226,13 @@ function _find_ranges_for_join_pa!(ranges, x, invpool, y, _fl::F1, _fr::F2, ::Va end - -function _fill_oncols_left_table_left!(_res, x, ranges, en, total, fill_val;inbits = nothing, en2 = nothing, threads = true) - if inbits === nothing - @_threadsfor threads for i in 1:length(x) - i == 1 ? lo = 1 : lo = en[i - 1] + 1 - hi = en[i] - _fill_val_join!(_res, lo:hi, x[i]) - end - else - @_threadsfor threads for i in 1:length(x) - i == 1 ?
lo = 1 : lo = en[i - 1] + 1 - hi = en[i] - # @show sum(view(inbits, lo:hi)) - # sum(view(inbits, lo:hi)) == 0 && continue - left_fill_val_join!(_res, x[i], inbits, lo:hi) - end +function _fill_oncols_left_table_left!(_res, x, ranges, en, total, fill_val; threads = true) + @_threadsfor threads for i in 1:length(x) + i == 1 ? lo = 1 : lo = en[i - 1] + 1 + hi = en[i] + _fill_val_join!(_res, lo:hi, x[i]) end + @_threadsfor threads for i in en[length(x)]+1:total _res[i] = fill_val end @@ -301,44 +276,46 @@ end function _fill_right_cols_table_left!(_res, x, ranges, en, total, fill_val; inbits = nothing, en2 = nothing, threads = true) if inbits === nothing - @_threadsfor threads for i in 1:length(ranges) + @_threadsfor threads for i in 1:length(ranges) i == 1 ? lo = 1 : lo = en[i - 1] + 1 hi = en[i] length(ranges[i]) == 0 ? _fill_val_join!(_res, lo:hi, fill_val) : copyto!(_res, lo, x, ranges[i].start, length(ranges[i])) end else @_threadsfor threads for i in 1:length(ranges) - i == 1 ? lo = 1 : lo = en[i - 1] + 1 - hi = en[i] - left_fill_right_col_range!(_res, x, ranges[i], inbits, lo:hi) + if i == 1 + lo2 = 1 + else + lo2 = en2[i-1] + 1 + end + hi2 = en2[i] + left_fill_right_col_range!(_res, lo2:hi2, x, ranges[i], en[i]) end end end -function _fill_right_col_range!(_res, r2, x, ranges, inbits, r) +function left_fill_right_col_range!(_res, r2, x, ranges, r) cnt = 1 cnt_r = 1 - lo = r2.start - for i in r - if inbits[i] - _res[lo+cnt-1] = x[ranges[cnt_r]] + if length(r2) == 0 + _res[r] = missing + else + for i in r2 + _res[r-length(r2)+1+cnt-1] = x[ranges[cnt_r]] cnt += 1 + cnt_r += 1 end - cnt_r += 1 end end -function left_fill_right_col_range!(_res, x, ranges, inbits, r) +function _fill_right_col_range!(_res, r2, x, ranges, inbits, r) cnt = 1 cnt_r = 1 - lo = r.start + lo = r2.start for i in r if inbits[i] _res[lo+cnt-1] = x[ranges[cnt_r]] cnt += 1 - else - _res[lo+cnt-1] = missing - cnt += 1 end cnt_r += 1 end @@ -370,25 +347,49 @@ function 
_fill_right_cols_table_inner!(_res, x, ranges, en, total; inbits = noth end end -function _create_multiple_match_col_left(ranges, total_length) +function _create_multiple_match_col_left(ranges, en, total_length) res = allocatecol(Bool, total_length) cnt = 0 - for i in 1:length(ranges) - if length(ranges[i]) == 0 - cnt += 1 - res[cnt] = false - else - if length(ranges[i]) == 1 + if en === nothing + for i in 1:length(ranges) + if length(ranges[i]) == 0 cnt += 1 res[cnt] = false else - for j in ranges[i] + if length(ranges[i]) == 1 + cnt += 1 + res[cnt] = false + else + for j in ranges[i] + cnt += 1 + res[cnt] = true + end + end + end + end + else + for i in 1:length(ranges) + if i == 1 + lo = 1 + else + lo = en[i - 1] + 1 + end + hi = en[i] + if length(lo:hi) == 0 + cnt+=1 + res[cnt] = false + elseif length(lo:hi) == 1 + cnt += 1 + res[cnt] = false + else + for j in lo:hi cnt += 1 res[cnt] = true end end end end + res end function _create_multiple_match_col_inner(ranges, en, total_length) @@ -604,14 +605,29 @@ function _join_left(dsl, dsr, ::Val{T}; onleft, onright,onright_range, makeuniqu _fr = getformat(dsr, right_range_cols[2]) end revised_ends = _mark_lt_part!(inbits, _columns(dsl)[left_range_col], _columns(dsr)[right_range_cols[2]], _fl, _fr, ranges, idx, new_ends, total_length < typemax(Int32) ? 
Val(Int32) : Val(Int64); strict = strict_inequality[2], threads = threads) + inbit_ranges = Vector{UnitRange{T}}(undef, nrow(dsl)) + + @_threadsfor threads for i in 1:length(ranges) + if i == 1 + lo2 = 1 + else + lo2 = revised_ends[i-1] + 1 + end + hi2 = revised_ends[i] + inbit_ranges[i] = lo2:hi2 + end + + new_ends = map(x -> max(1, length(x)), inbit_ranges) + our_cumsum!(new_ends) + total_length = new_ends[end] end - + if check @assert total_length < 10*nrow(dsl) "the output data set will be very large ($(total_length)×$(ncol(dsl)+length(right_cols))) compared to the left data set size ($(nrow(dsl))×$(ncol(dsl))), make sure that the `on` keyword is selected properly, alternatively, pass `check = false` to ignore this error." end if multiple_match - multiple_match_col = _create_multiple_match_col_left(ranges, total_length) + multiple_match_col = _create_multiple_match_col_left(ranges,revised_ends, total_length) end res = [] @@ -619,12 +635,13 @@ function _join_left(dsl, dsr, ::Val{T}; onleft, onright,onright_range, makeuniqu _res = allocatecol(_columns(dsl)[j], total_length, addmissing = false) if DataAPI.refpool(_res) !== nothing # fill_val = DataAPI.invrefpool(_res)[missing] - _fill_oncols_left_table_left!(_res.refs, DataAPI.refarray(_columns(dsl)[j]), ranges, new_ends, total_length, missing; inbits = inbits, en2 = revised_ends, threads = threads) + _fill_oncols_left_table_left!(_res.refs, DataAPI.refarray(_columns(dsl)[j]), ranges, new_ends, total_length, missing; threads = threads) else - _fill_oncols_left_table_left!(_res, _columns(dsl)[j], ranges, new_ends, total_length, missing; inbits = inbits, en2 = revised_ends,threads = threads) + _fill_oncols_left_table_left!(_res, _columns(dsl)[j], ranges, new_ends, total_length, missing; threads = threads) end push!(res, _res) end + if dsl isa SubDataset newds = Dataset(res, copy(index(dsl)), copycols = false) else @@ -635,9 +652,9 @@ function _join_left(dsl, dsr, ::Val{T}; onleft, onright,onright_range, makeuniqu 
_res = allocatecol(_columns(dsr)[right_cols[j]], total_length) if DataAPI.refpool(_res) !== nothing fill_val = DataAPI.invrefpool(_res)[missing] - _fill_right_cols_table_left!(_res.refs, view(DataAPI.refarray(_columns(dsr)[right_cols[j]]), idx), ranges, new_ends, total_length, fill_val,inbits = inbits, en2 = revised_ends, threads = threads) + _fill_right_cols_table_left!(_res.refs, view(DataAPI.refarray(_columns(dsr)[right_cols[j]]), idx), ranges, new_ends, total_length, fill_val; inbits = inbits, en2 = revised_ends, threads = threads) else - _fill_right_cols_table_left!(_res, view(_columns(dsr)[right_cols[j]], idx), ranges, new_ends, total_length, missing,inbits = inbits, en2 = revised_ends, threads = threads) + _fill_right_cols_table_left!(_res, view(_columns(dsr)[right_cols[j]], idx), ranges, new_ends, total_length, missing; inbits = inbits, en2 = revised_ends, threads = threads) end push!(_columns(newds), _res) new_var_name = make_unique([_names(dsl); _names(dsr)[right_cols[j]]], makeunique = makeunique)[end] @@ -652,22 +669,17 @@ function _join_left(dsl, dsr, ::Val{T}; onleft, onright,onright_range, makeuniqu if obs_id[1] obs_id_name1 = Symbol(obs_id_name, "_left") obs_id_left = allocatecol(nrow(dsl) < typemax(Int32) ? 
Int32 : Int64, total_length) - _fill_oncols_left_table_left!(obs_id_left, 1:nrow(dsl), ranges, new_ends, total_length, missing;inbits = inbits, en2 = revised_ends, threads = threads) + _fill_oncols_left_table_left!(obs_id_left, 1:nrow(dsl), ranges, new_ends, total_length, missing; threads = threads) insertcols!(dsnewdsl, ncol(newds)+1, obs_id_name1 => obs_id_left, unsupported_copy_cols = false, makeunique = makeunique) end if obs_id[2] obs_id_name2 = Symbol(obs_id_name, "_right") obs_id_right = allocatecol(T, total_length) - _fill_right_cols_table_left!(obs_id_right, idx, ranges, new_ends, total_length, missing, inbits = inbits, en2 = revised_ends, threads = threads) + _fill_right_cols_table_left!(obs_id_right, idx, ranges, new_ends, total_length, missing; inbits = inbits, en2 = revised_ends, threads = threads) insertcols!(newds, ncol(newds)+1, obs_id_name2 => obs_id_right, unsupported_copy_cols = false, makeunique = makeunique) end + newds - if length(right_range_cols) == 2 - filter(newds,:,by = !ismissing, type = any) - else - newds - end - end function _join_left!(dsl::Dataset, dsr::AbstractDataset, ::Val{T}; onleft, onright, onright_range, makeunique = false,mapformats = [true, true], stable = false, alg = HeapSort, check = true, accelerate = false, droprangecols = true, strict_inequality = [false, false], method = :sort, threads = true, multiple_match = false, multiple_match_name = :multiple, obs_id = [false, false], obs_id_name = :obs_id) where T @@ -745,10 +757,6 @@ function _join_left!(dsl::Dataset, dsr::AbstractDataset, ::Val{T}; onleft, onrig _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], length(oncols_left); type = type, nsfpaj = nsfpaj, threads = threads) end - if !all(x->length(x) <= 1, ranges) - throw(ArgumentError("`leftjoin!` can only be used when each observation in left data set matches at most one observation from right data set")) - end - new_ends = map(x -> max(1, length(x)), 
ranges) our_cumsum!(new_ends) total_length = new_ends[end] @@ -767,6 +775,25 @@ function _join_left!(dsl::Dataset, dsr::AbstractDataset, ::Val{T}; onleft, onrig _fr = getformat(dsr, right_range_cols[2]) end revised_ends = _mark_lt_part!(inbits, _columns(dsl)[left_range_col], _columns(dsr)[right_range_cols[2]], _fl, _fr, ranges, idx, new_ends, total_length < typemax(Int32) ? Val(Int32) : Val(Int64); strict = strict_inequality[2], threads = threads) + inbit_ranges = Vector{UnitRange{T}}(undef, nrow(dsl)) + + @_threadsfor threads for i in 1:length(ranges) + if i == 1 + lo2 = 1 + else + lo2 = revised_ends[i-1] + 1 + end + hi2 = revised_ends[i] + inbit_ranges[i] = lo2:hi2 + end + + new_ends = map(x -> max(1, length(x)), inbit_ranges) + our_cumsum!(new_ends) + total_length = new_ends[end] + end + + if !all(x->length(x) <= 1, inbit_ranges) + throw(ArgumentError("`leftjoin!` can only be used when each observation in left data set matches at most one observation from right data set")) end if check @@ -774,16 +801,16 @@ function _join_left!(dsl::Dataset, dsr::AbstractDataset, ::Val{T}; onleft, onrig end if multiple_match - multiple_match_col = _create_multiple_match_col_left(ranges, total_length) + multiple_match_col = _create_multiple_match_col_left(ranges,revised_ends, total_length) end for j in 1:length(right_cols) _res = allocatecol(_columns(dsr)[right_cols[j]], total_length) if DataAPI.refpool(_res) !== nothing fill_val = DataAPI.invrefpool(_res)[missing] - _fill_right_cols_table_left!(_res.refs, view(DataAPI.refarray(_columns(dsr)[right_cols[j]]), idx), ranges, new_ends, total_length, fill_val, threads = threads) + _fill_right_cols_table_left!(_res.refs, view(DataAPI.refarray(_columns(dsr)[right_cols[j]]), idx), ranges, new_ends, total_length, fill_val; inbits = inbits, en2 = revised_ends, threads = threads) else - _fill_right_cols_table_left!(_res, view(_columns(dsr)[right_cols[j]], idx), ranges, new_ends, total_length, missing, threads = threads) + 
_fill_right_cols_table_left!(_res, view(_columns(dsr)[right_cols[j]], idx), ranges, new_ends, total_length, missing; inbits = inbits, en2 = revised_ends, threads = threads) end push!(_columns(dsl), _res) new_var_name = make_unique([_names(dsl); _names(dsr)[right_cols[j]]], makeunique = makeunique)[end] @@ -802,7 +829,7 @@ function _join_left!(dsl::Dataset, dsr::AbstractDataset, ::Val{T}; onleft, onrig if obs_id[2] obs_id_name2 = Symbol(obs_id_name, "_right") obs_id_right = allocatecol(T, total_length) - _fill_right_cols_table_left!(obs_id_right, idx, ranges, new_ends, total_length, missing, threads = threads) + _fill_right_cols_table_left!(obs_id_right, idx, ranges, new_ends, total_length, missing; inbits = inbits, en2 = revised_ends, threads = threads) insertcols!(dsl, ncol(dsl)+1, obs_id_name2 => obs_id_right, unsupported_copy_cols = false, makeunique = makeunique) end From c257c269ef1defa5bc899e15528df1613a988d7f Mon Sep 17 00:00:00 2001 From: kyo227 <43748518+kyo227@users.noreply.github.com> Date: Mon, 12 Sep 2022 23:51:21 +0800 Subject: [PATCH 5/8] Fix some bugs, and finish the test of leftjoin in join.jl. --- src/join/join.jl | 74 +++++++---- test/join.jl | 322 ++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 370 insertions(+), 26 deletions(-) diff --git a/src/join/join.jl b/src/join/join.jl index 7d5b454e..39ffcdbb 100644 --- a/src/join/join.jl +++ b/src/join/join.jl @@ -274,7 +274,7 @@ function _fill_oncols_left_table_anti!(_res, x, ranges, en, total; threads = tru end end -function _fill_right_cols_table_left!(_res, x, ranges, en, total, fill_val; inbits = nothing, en2 = nothing, threads = true) +function _fill_right_cols_table_left!(_res, x, ranges, en, total, fill_val; ra = nothing, inbits = nothing, en2 = nothing, threads = true) if inbits === nothing @_threadsfor threads for i in 1:length(ranges) i == 1 ? 
lo = 1 : lo = en[i - 1] + 1 @@ -283,26 +283,25 @@ function _fill_right_cols_table_left!(_res, x, ranges, en, total, fill_val; inbi end else @_threadsfor threads for i in 1:length(ranges) - if i == 1 - lo2 = 1 - else - lo2 = en2[i-1] + 1 - end + i == 1 ? lo2 = 1 : lo2 = en2[i-1] + 1 hi2 = en2[i] - left_fill_right_col_range!(_res, lo2:hi2, x, ranges[i], en[i]) + left_fill_right_col_range!(_res, lo2:hi2, x, ranges[i], en[i], ra[i], inbits) end end end -function left_fill_right_col_range!(_res, r2, x, ranges, r) +function left_fill_right_col_range!(_res, r2, x, ranges, r, ra, inbits) cnt = 1 cnt_r = 1 if length(r2) == 0 + _res = allowmissing(_res) _res[r] = missing else - for i in r2 - _res[r-length(r2)+1+cnt-1] = x[ranges[cnt_r]] - cnt += 1 + for i in ra + if inbits[i] + _res[r-length(r2)+1+cnt-1] = x[ranges[cnt_r]] + cnt += 1 + end cnt_r += 1 end end @@ -593,6 +592,7 @@ function _join_left(dsl, dsr, ::Val{T}; onleft, onright,onright_range, makeuniqu inbits = nothing revised_ends = nothing + ra = nothing if length(right_range_cols) == 2 inbits = zeros(Bool, total_length) # TODO any optimisation is needed for pa? @@ -604,16 +604,20 @@ function _join_left(dsl, dsr, ::Val{T}; onleft, onright,onright_range, makeuniqu if mapformats[2] _fr = getformat(dsr, right_range_cols[2]) end - revised_ends = _mark_lt_part!(inbits, _columns(dsl)[left_range_col], _columns(dsr)[right_range_cols[2]], _fl, _fr, ranges, idx, new_ends, total_length < typemax(Int32) ? Val(Int32) : Val(Int64); strict = strict_inequality[2], threads = threads) + revised_ends = _mark_lt_part!(inbits, _columns(dsl)[left_range_col], _columns(dsr)[right_range_cols[2]], _fl, _fr, ranges, idx, new_ends, total_length < typemax(Int32) ? 
Val(Int32) : Val(Int64); strict = strict_inequality[2], threads = threads) inbit_ranges = Vector{UnitRange{T}}(undef, nrow(dsl)) - + ra = Vector{UnitRange{T}}(undef, nrow(dsl)) @_threadsfor threads for i in 1:length(ranges) if i == 1 + lo = 1 lo2 = 1 else + lo = new_ends[i-1] + 1 lo2 = revised_ends[i-1] + 1 end + hi = new_ends[i] hi2 = revised_ends[i] + ra[i] = lo:hi inbit_ranges[i] = lo2:hi2 end @@ -649,12 +653,22 @@ function _join_left(dsl, dsr, ::Val{T}; onleft, onright,onright_range, makeuniqu end for j in 1:length(right_cols) - _res = allocatecol(_columns(dsr)[right_cols[j]], total_length) + if dsr isa SubDataset + _res = allocatecol(_columns(copy(dsr))[right_cols[j]], total_length, addmissing = true) + else + _res = allocatecol(_columns(dsr)[right_cols[j]], total_length, addmissing = true) + end if DataAPI.refpool(_res) !== nothing fill_val = DataAPI.invrefpool(_res)[missing] - _fill_right_cols_table_left!(_res.refs, view(DataAPI.refarray(_columns(dsr)[right_cols[j]]), idx), ranges, new_ends, total_length, fill_val; inbits = inbits, en2 = revised_ends, threads = threads) + _fill_right_cols_table_left!(_res.refs, view(DataAPI.refarray(_columns(dsr)[right_cols[j]]), idx), ranges, new_ends, total_length, fill_val; ra = ra, inbits = inbits, en2 = revised_ends, threads = threads) + # Solve the #undef problem. 
There has to be a better way + for i in 1:length(_res) + if !isassigned(_res,i) + _res[i] = missing + end + end else - _fill_right_cols_table_left!(_res, view(_columns(dsr)[right_cols[j]], idx), ranges, new_ends, total_length, missing; inbits = inbits, en2 = revised_ends, threads = threads) + _fill_right_cols_table_left!(_res, view(DataAPI.refarray(_columns(dsr)[right_cols[j]]), idx), ranges, new_ends, total_length, missing; ra = ra, inbits = inbits, en2 = revised_ends, threads = threads) end push!(_columns(newds), _res) new_var_name = make_unique([_names(dsl); _names(dsr)[right_cols[j]]], makeunique = makeunique)[end] @@ -675,7 +689,7 @@ function _join_left(dsl, dsr, ::Val{T}; onleft, onright,onright_range, makeuniqu if obs_id[2] obs_id_name2 = Symbol(obs_id_name, "_right") obs_id_right = allocatecol(T, total_length) - _fill_right_cols_table_left!(obs_id_right, idx, ranges, new_ends, total_length, missing; inbits = inbits, en2 = revised_ends, threads = threads) + _fill_right_cols_table_left!(obs_id_right, idx, ranges, new_ends, total_length, missing; ra = ra, inbits = inbits, en2 = revised_ends, threads = threads) insertcols!(newds, ncol(newds)+1, obs_id_name2 => obs_id_right, unsupported_copy_cols = false, makeunique = makeunique) end newds @@ -763,6 +777,7 @@ function _join_left!(dsl::Dataset, dsr::AbstractDataset, ::Val{T}; onleft, onrig inbits = nothing revised_ends = nothing + ra = nothing if length(right_range_cols) == 2 inbits = zeros(Bool, total_length) # TODO any optimisation is needed for pa? @@ -774,16 +789,20 @@ function _join_left!(dsl::Dataset, dsr::AbstractDataset, ::Val{T}; onleft, onrig if mapformats[2] _fr = getformat(dsr, right_range_cols[2]) end - revised_ends = _mark_lt_part!(inbits, _columns(dsl)[left_range_col], _columns(dsr)[right_range_cols[2]], _fl, _fr, ranges, idx, new_ends, total_length < typemax(Int32) ? 
Val(Int32) : Val(Int64); strict = strict_inequality[2], threads = threads) + revised_ends = _mark_lt_part!(inbits, _columns(dsl)[left_range_col], _columns(dsr)[right_range_cols[2]], _fl, _fr, ranges, idx, new_ends, total_length < typemax(Int32) ? Val(Int32) : Val(Int64); strict = strict_inequality[2], threads = threads) inbit_ranges = Vector{UnitRange{T}}(undef, nrow(dsl)) - + ra = Vector{UnitRange{T}}(undef, nrow(dsl)) @_threadsfor threads for i in 1:length(ranges) if i == 1 + lo = 1 lo2 = 1 else + lo = new_ends[i-1] + 1 lo2 = revised_ends[i-1] + 1 end + hi = new_ends[i] hi2 = revised_ends[i] + ra[i] = lo:hi inbit_ranges[i] = lo2:hi2 end @@ -805,12 +824,21 @@ function _join_left!(dsl::Dataset, dsr::AbstractDataset, ::Val{T}; onleft, onrig end for j in 1:length(right_cols) - _res = allocatecol(_columns(dsr)[right_cols[j]], total_length) + if dsr isa SubDataset + _res = allocatecol(_columns(copy(dsr))[right_cols[j]], total_length, addmissing = true) + else + _res = allocatecol(_columns(dsr)[right_cols[j]], total_length, addmissing = true) + end if DataAPI.refpool(_res) !== nothing fill_val = DataAPI.invrefpool(_res)[missing] - _fill_right_cols_table_left!(_res.refs, view(DataAPI.refarray(_columns(dsr)[right_cols[j]]), idx), ranges, new_ends, total_length, fill_val; inbits = inbits, en2 = revised_ends, threads = threads) + _fill_right_cols_table_left!(_res.refs, view(DataAPI.refarray(_columns(dsr)[right_cols[j]]), idx), ranges, new_ends, total_length, fill_val;ra=ra, inbits = inbits, en2 = revised_ends, threads = threads) + for x in 1:length(_res) + if !isassigned(_res,x) + _res[x] = missing + end + end else - _fill_right_cols_table_left!(_res, view(_columns(dsr)[right_cols[j]], idx), ranges, new_ends, total_length, missing; inbits = inbits, en2 = revised_ends, threads = threads) + _fill_right_cols_table_left!(_res, view(_columns(dsr)[right_cols[j]], idx), ranges, new_ends, total_length, missing;ra=ra, inbits = inbits, en2 = revised_ends, threads = threads) end 
push!(_columns(dsl), _res) new_var_name = make_unique([_names(dsl); _names(dsr)[right_cols[j]]], makeunique = makeunique)[end] @@ -829,7 +857,7 @@ function _join_left!(dsl::Dataset, dsr::AbstractDataset, ::Val{T}; onleft, onrig if obs_id[2] obs_id_name2 = Symbol(obs_id_name, "_right") obs_id_right = allocatecol(T, total_length) - _fill_right_cols_table_left!(obs_id_right, idx, ranges, new_ends, total_length, missing; inbits = inbits, en2 = revised_ends, threads = threads) + _fill_right_cols_table_left!(obs_id_right, idx, ranges, new_ends, total_length, missing;ra=ra, inbits = inbits, en2 = revised_ends, threads = threads) insertcols!(dsl, ncol(dsl)+1, obs_id_name2 => obs_id_right, unsupported_copy_cols = false, makeunique = makeunique) end diff --git a/test/join.jl b/test/join.jl index 87d47ec6..8bc74ec3 100644 --- a/test/join.jl +++ b/test/join.jl @@ -1969,7 +1969,21 @@ end @test inn_r1_a == inn_r1_t @test inn_r1_v_a == inn_r1_t - # + left_r1 = leftjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], stable = true) + left_r1_v = leftjoin(store, view(roster, :, [1,2,4,3]), on = [:store => :store, :date => (:start_date, :end_date)], stable = true) + left_r1_a = leftjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], stable = true, accelerate = true) + left_r1_v_a = leftjoin(store, view(roster, :, [1,2,4,3]), on = [:store => :store, :date => (:start_date, :end_date)], stable = true, accelerate = true) + + @test left_r1 == leftjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], stable = true, method = :hash) + @test left_r1_v == leftjoin(store, view(roster, :, [1,2,4,3]), on = [:store => :store, :date => (:start_date, :end_date)], stable = true, method = :hash) + @test left_r1_a == leftjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], stable = true, accelerate = true, method = :hash) + @test left_r1_v_a == leftjoin(store, view(roster, :, [1,2,4,3]), on = 
[:store => :store, :date => (:start_date, :end_date)], stable = true, accelerate = true, method = :hash) + + left_r1_t = Dataset([Union{Missing, Date}[Date("2019-10-05"), Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-02"), Date("2019-10-02"),Date("2020-01-01"), Date("2019-10-01"), Date("2019-10-02"), Date("2019-10-02"), Date("2019-10-05"), Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03")], Union{Missing, String}["A", "A", "A", "A", "A", "A", "B", "B", "A", "B", "A", "A", "B", "B", "B", "B", "B", "B", "A", "A", "A", "B", "B", "B"], Union{Missing, Int64}[3, 4, 1, 2, 3, 4, 5, 6, missing, 5, 1, 2, 7, 8, 5, 6, 7, 8, 1, 2, 3, 5, 6, 7]], ["date", "store", "employee_ID"]) + @test left_r1 == left_r1_t + @test left_r1_v == left_r1_t + @test left_r1_a == left_r1_t + @test left_r1_v_a == left_r1_t inn_r1 = innerjoin(store, roster, on = [:store => :store, :date => (:end_date, :start_date)], stable = true) inn_r1_v = innerjoin(store, view(roster, :, [1,2,4,3]), on = [:store => :store, :date => (:end_date, :start_date)], stable = true) @@ -1988,6 +2002,22 @@ end @test inn_r1_a == inn_r1_t @test inn_r1_v_a == inn_r1_t + left_r1 = leftjoin(store, roster, on = [:store => :store, :date => (:end_date, :start_date)], stable = true) + left_r1_v = leftjoin(store, view(roster, :, [1,2,4,3]), on = [:store => :store, :date => (:end_date, :start_date)], stable = true) + left_r1_a = leftjoin(store, roster, on = [:store => :store, :date => (:end_date, :start_date)], stable = true, accelerate = true) + left_r1_v_a = leftjoin(store, view(roster, :, [1,2,4,3]), on = [:store => :store, :date => (:end_date, :start_date)], stable = true, accelerate = true) + + @test left_r1 == leftjoin(store, roster, on = [:store => :store, :date => (:end_date, :start_date)], stable 
= true, method = :hash) + @test left_r1_v == leftjoin(store, view(roster, :, [1,2,4,3]), on = [:store => :store, :date => (:end_date, :start_date)], stable = true, method = :hash) + @test left_r1_a == leftjoin(store, roster, on = [:store => :store, :date => (:end_date, :start_date)], stable = true, accelerate = true, method = :hash) + @test left_r1_v_a == leftjoin(store, view(roster, :, [1,2,4,3]), on = [:store => :store, :date => (:end_date, :start_date)], stable = true, accelerate = true, method = :hash) + + left_r1_t = Dataset(date=Union{Missing, Date}[Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-02"), Date("2020-01-01"), Date("2019-10-01"), Date("2019-10-02"), Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-03"), Date("2019-10-03")], store=Union{Missing, String}["A", "A", "B", "A", "B", "A", "B", "B", "A", "B"], employee_ID=Union{Missing, Int}[missing,missing,missing,missing,missing,missing,missing,missing,missing,missing]) + @test left_r1 == left_r1_t + @test left_r1_v == left_r1_t + @test left_r1_a == left_r1_t + @test left_r1_v_a == left_r1_t + inn_r1 = innerjoin(store, roster, on = [:store => :store, :date => (nothing, :start_date)], stable = true) inn_r1_a = innerjoin(store, roster, on = [:store => :store, :date => (nothing, :start_date)], stable = true, accelerate = true) @@ -1998,6 +2028,17 @@ end @test inn_r1 == inn_r1_t @test inn_r1_a == inn_r1_t + + left_r1 = leftjoin(store, roster, on = [:store => :store, :date => (nothing, :start_date)], stable = true) + left_r1_a = leftjoin(store, roster, on = [:store => :store, :date => (nothing, :start_date)], stable = true, accelerate = true) + + @test left_r1 == leftjoin(store, roster, on = [:store => :store, :date => (nothing, :start_date)], stable = true, method = :hash) + @test left_r1_a == leftjoin(store, roster, on = [:store => :store, :date => (nothing, :start_date)], stable = true, accelerate = true, method = :hash) + + left_r1_t = Dataset([Union{Missing, 
Date}[Date("2019-10-05"),Date("2019-10-04"), Date("2019-10-02"), Date("2019-10-02"), Date("2019-10-02"), Date("2020-01-01"), Date("2019-10-01"), Date("2019-10-01"),Date("2019-10-01"), Date("2019-10-02"), Date("2019-10-02"), Date("2019-10-02"),Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03")], Union{Missing, String}["A", "A", "B", "B", "B", "A", "B", "B", "B", "A", "A", "A","B", "B", "A", "A", "B", "B"], Union{Missing, Int64}[missing,4, 6, 7, 8,missing, 6, 7, 8, 2, 3, 4, missing, 8, 3, 4, 7, 8], Union{Missing, Date}[missing, Date("2019-10-06"), Date("2019-10-04"), Date("2019-10-05"), Date("2019-10-06"), missing, Date("2019-10-04"), Date("2019-10-05"), Date("2019-10-06"), Date("2019-10-04"), Date("2019-10-05"), Date("2019-10-06"), missing, Date("2019-10-06"), Date("2019-10-05"), Date("2019-10-06"), Date("2019-10-05"), Date("2019-10-06")]], ["date", "store", "employee_ID", "end_date"]) + @test left_r1 == left_r1_t + @test left_r1_a == left_r1_t + inn_r2 = innerjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], makeunique = true, stable = true, strict_inequality = true) inn_r2_a = innerjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], makeunique = true, stable = true, strict_inequality = true, accelerate = true) @@ -2008,16 +2049,40 @@ end @test inn_r2 == inn_r2_t @test inn_r2_a == inn_r2_t + left_r2 = leftjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], makeunique = true, stable = true, strict_inequality = true) + left_r2_a = leftjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], makeunique = true, stable = true, strict_inequality = true, accelerate = true) + + @test left_r2 == leftjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], makeunique = true, stable = true, strict_inequality = true, method = :hash) + @test left_r2_a == leftjoin(store, roster, on 
= [:store => :store, :date => (:start_date, :end_date)], makeunique = true, stable = true, strict_inequality = true, accelerate = true, method = :hash) + + left_r2_t = Dataset([Union{Missing, Date}[Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-02"),Date("2020-01-01"), Date("2019-10-01"), Date("2019-10-02"), Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03")], Union{Missing, String}["A", "A", "B","A", "B", "A", "B", "B", "A", "A", "B", "B"], Union{Missing, Int64}[4, 3, 5,missing, 5, 1, 8, 7, 1, 2, 5, 6]], ["date", "store", "employee_ID"]) + @test left_r2 == left_r2_t + @test left_r2_a == left_r2_t + inn_r2 = innerjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], makeunique = true, stable = true, strict_inequality = true, droprangecols = false) @test inn_r2 == innerjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], makeunique = true, stable = true, strict_inequality = true, droprangecols = false, method = :hash) inn_r2_t = Dataset([Union{Missing, Date}[Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-02"), Date("2019-10-01"), Date("2019-10-02"), Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03")], Union{Missing, String}["A", "A", "B", "B", "A", "B", "B", "A", "A", "B", "B"], Union{Missing, Int64}[4, 3, 5, 5, 1, 8, 7, 1, 2, 5, 6], Union{Missing, Date}[Date("2019-10-04"), Date("2019-10-03"), Date("2019-09-30"), Date("2019-09-30"), Date("2019-09-30"), Date("2019-10-04"), Date("2019-10-03"), Date("2019-09-30"), Date("2019-10-02"), Date("2019-09-30"), Date("2019-10-02")], Union{Missing, Date}[Date("2019-10-06"), Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-06"), Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04")]], ["date", "store", "employee_ID", "start_date", 
"end_date"]) @test inn_r2 == inn_r2_t + + left_r2 = leftjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], makeunique = true, stable = true, strict_inequality = true, droprangecols = false) + @test left_r2 == leftjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], makeunique = true, stable = true, strict_inequality = true, droprangecols = false, method = :hash) + + left_r2_t = Dataset([Union{Missing, Date}[Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-02"),Date("2020-01-01"), Date("2019-10-01"), Date("2019-10-02"), Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03")], Union{Missing, String}["A", "A", "B","A", "B", "A", "B", "B", "A", "A", "B", "B"], Union{Missing, Int64}[4, 3, 5,missing, 5, 1, 8, 7, 1, 2, 5, 6], Union{Missing, Date}[Date("2019-10-04"), Date("2019-10-03"), Date("2019-09-30"),missing, Date("2019-09-30"), Date("2019-09-30"), Date("2019-10-04"), Date("2019-10-03"), Date("2019-09-30"), Date("2019-10-02"), Date("2019-09-30"), Date("2019-10-02")], Union{Missing, Date}[Date("2019-10-06"), Date("2019-10-05"), Date("2019-10-04"),missing, Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-06"), Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04")]], ["date", "store", "employee_ID", "start_date", "end_date"]) + @test left_r2 == left_r2_t + inn_r2 = innerjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], makeunique = true, stable = true, strict_inequality = [true, false], droprangecols = true) @test inn_r2 == innerjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], makeunique = true, stable = true, strict_inequality = [true, false], droprangecols = true, method = :hash) inn_r2_t = Dataset([Union{Missing, Date}[Date("2019-10-05"), Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-02"), 
Date("2019-10-01"), Date("2019-10-02"), Date("2019-10-05"), Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03")], Union{Missing, String}["A", "A", "A", "A", "A", "B", "B", "A", "B", "B", "B", "B", "B", "A", "A", "B", "B"], Union{Missing, Int64}[3, 4, 1, 2, 3, 5, 5, 1, 7, 8, 5, 6, 7, 1, 2, 5, 6]],["date", "store", "employee_ID"]) @test inn_r2 == inn_r2_t + + left_r2 = leftjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], makeunique = true, stable = true, strict_inequality = [true, false], droprangecols = true) + @test left_r2 == leftjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], makeunique = true, stable = true, strict_inequality = [true, false], droprangecols = true, method = :hash) + + left_r2_t = Dataset([Union{Missing, Date}[Date("2019-10-05"), Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-02"),Date("2020-01-01"), Date("2019-10-01"), Date("2019-10-02"), Date("2019-10-05"), Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03")], Union{Missing, String}["A", "A", "A", "A", "A", "B", "A", "B", "A", "B", "B", "B", "B", "B", "A", "A", "B", "B"], Union{Missing, Int64}[3, 4, 1, 2, 3, 5, missing, 5, 1, 7, 8, 5, 6, 7, 1, 2, 5, 6]],["date", "store", "employee_ID"]) + @test left_r2 == left_r2_t + push!(roster, ["C", 9, Date(2020), Date(2020)]) inn_r1 = innerjoin(store, roster, on = [:store => :store, :date => (nothing, :start_date)], stable = true) @test inn_r1 == innerjoin(store, roster, on = [:store => :store, :date => (nothing, :start_date)], stable = true, method = :hash) @@ -2025,6 +2090,12 @@ end inn_r1_t = Dataset([Union{Missing, Date}[Date("2019-10-04"), Date("2019-10-02"), Date("2019-10-02"), Date("2019-10-02"), Date("2019-10-01"), 
Date("2019-10-01"), Date("2019-10-01"), Date("2019-10-02"), Date("2019-10-02"), Date("2019-10-02"), Date("2019-10-04"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03")], Union{Missing, String}["A", "B", "B", "B", "B", "B", "B", "A", "A", "A", "B", "A", "A", "B", "B"], Union{Missing, Int64}[4, 6, 7, 8, 6, 7, 8, 2, 3, 4, 8, 3, 4, 7, 8], Union{Missing, Date}[Date("2019-10-06"), Date("2019-10-04"), Date("2019-10-05"), Date("2019-10-06"), Date("2019-10-04"), Date("2019-10-05"), Date("2019-10-06"), Date("2019-10-04"), Date("2019-10-05"), Date("2019-10-06"), Date("2019-10-06"), Date("2019-10-05"), Date("2019-10-06"), Date("2019-10-05"), Date("2019-10-06")]], ["date", "store", "employee_ID", "end_date"]) @test inn_r1 == inn_r1_t + left_r1 = leftjoin(store, roster, on = [:store => :store, :date => (nothing, :start_date)], stable = true) + @test left_r1 == leftjoin(store, roster, on = [:store => :store, :date => (nothing, :start_date)], stable = true, method = :hash) + + left_r1_t = Dataset([Union{Missing, Date}[Date("2019-10-05"),Date("2019-10-04"), Date("2019-10-02"), Date("2019-10-02"), Date("2019-10-02"), Date("2020-01-01"), Date("2019-10-01"), Date("2019-10-01"),Date("2019-10-01"), Date("2019-10-02"), Date("2019-10-02"), Date("2019-10-02"),Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03")], Union{Missing, String}["A", "A", "B", "B", "B", "A", "B", "B", "B", "A", "A", "A","B", "B", "A", "A", "B", "B"], Union{Missing, Int64}[missing,4, 6, 7, 8,missing, 6, 7, 8, 2, 3, 4, missing, 8, 3, 4, 7, 8], Union{Missing, Date}[missing, Date("2019-10-06"), Date("2019-10-04"), Date("2019-10-05"), Date("2019-10-06"), missing, Date("2019-10-04"), Date("2019-10-05"), Date("2019-10-06"), Date("2019-10-04"), Date("2019-10-05"), Date("2019-10-06"), missing, Date("2019-10-06"), Date("2019-10-05"), Date("2019-10-06"), Date("2019-10-05"), Date("2019-10-06")]], ["date", "store", "employee_ID", 
"end_date"]) + @test left_r1 == left_r1_t + roster[4,3] = missing roster[6,4] = missing roster[8,3:4] .= missing @@ -2039,6 +2110,15 @@ end @test inn_r1 == inn_r1_t @test inn_r1_a == inn_r1_t + left_r1 = leftjoin(store, roster, on = [:store => :store, :date => (nothing, :start_date)], stable = true) + left_r1_a = leftjoin(store, roster, on = [:store => :store, :date => (nothing, :start_date)], stable = true, accelerate = true) + @test left_r1 == leftjoin(store, roster, on = [:store => :store, :date => (nothing, :start_date)], stable = true, method = :hash) + @test left_r1_a == leftjoin(store, roster, on = [:store => :store, :date => (nothing, :start_date)], stable = true, accelerate = true, method = :hash) + + left_r1_t = Dataset([Union{Missing, Date}[Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-02"), Date("2019-10-02"), Date("2019-10-02"), Date("2020-01-01"), Date("2019-10-01"), Date("2019-10-01"), Date("2019-10-01"), Date("2019-10-02"), Date("2019-10-02"), Date("2019-10-02"), Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03")], Union{Missing, String}["A", "A", "A", "B", "B", "B", "A", "B", "B", "B", "A", "A", "A", "B", "B", "B", "A", "A", "A", "B", "B"], Union{Missing, Int64}[2, 4, 2, 6, 8, 7, 2, 6, 8, 7, 3, 4, 2, 7, 8, 7, 3, 4, 2, 8, 7], Union{Missing, Date}[Date("2019-10-04"), Date("2019-10-06"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-06"), missing, Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-06"), missing, missing, Date("2019-10-06"), Date("2019-10-04"), missing, Date("2019-10-06"), missing, missing, Date("2019-10-06"), Date("2019-10-04"), Date("2019-10-06"), missing]], ["date", "store", "employee_ID", "end_date"]) + @test left_r1 == left_r1_t + @test left_r1_a == left_r1_t + inn_r1 = innerjoin(store, roster, on = [:store => :store, :date => (:end_date, :start_date)], stable = true) inn_r1_a = 
innerjoin(store, roster, on = [:store => :store, :date => (:end_date, :start_date)], stable = true, accelerate = true) @@ -2048,33 +2128,70 @@ end inn_r1_t = Dataset([Union{Missing, Date}[Date("2019-10-05"), Date("2019-10-04"), Date("2020-01-01")], Union{Missing, String}["A", "A", "A"], Union{Missing, Int64}[2, 2, 2]], ["date", "store", "employee_ID"]) @test inn_r1 == inn_r1_t @test inn_r1_a == inn_r1_t + + left_r1 = leftjoin(store, roster, on = [:store => :store, :date => (:end_date, :start_date)], stable = true) + left_r1_a = leftjoin(store, roster, on = [:store => :store, :date => (:end_date, :start_date)], stable = true, accelerate = true) + + @test left_r1 == leftjoin(store, roster, on = [:store => :store, :date => (:end_date, :start_date)], stable = true, method = :hash) + @test left_r1_a == leftjoin(store, roster, on = [:store => :store, :date => (:end_date, :start_date)], stable = true, accelerate = true, method = :hash) + + left_r1_t = Dataset([Union{Missing, Date}[Date("2019-10-05"), Date("2019-10-04"),Date("2019-10-02"), Date("2020-01-01"),Date("2019-10-01"), Date("2019-10-02"), Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-03"), Date("2019-10-03")], Union{Missing, String}["A", "A", "B", "A", "B", "A", "B", "B", "A", "B"], Union{Missing, Int64}[2, 2,missing, 2, missing, missing, missing, missing, missing, missing]], ["date", "store", "employee_ID"]) + @test left_r1 == left_r1_t + @test left_r1_a == left_r1_t inn_r1 = innerjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], stable = true) @test inn_r1 == innerjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], stable = true, method = :hash) - inn_r1_t = Dataset([Union{Missing, Date}[Date("2019-10-05"), Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-02"), Date("2019-10-02"), Date("2020-01-01"), Date("2019-10-01"), Date("2019-10-02"), Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-04"), 
Date("2019-10-04"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03")], Union{Missing, String}["A", "A", "A", "A", "A", "B", "B", "A", "B", "A", "B", "B", "B", "B", "A", "A", "B", "B"], Union{Missing, Int64}[3, 4, 1, 3, 4, 5, 6, 3, 5, 1, 8, 5, 6, 8, 1, 3, 5, 6]], ["date", "store", "employee_ID"]) @test inn_r1 == inn_r1_t + + left_r1 = leftjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], stable = true) + @test left_r1 == leftjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], stable = true, method = :hash) + left_r1_t = Dataset([Union{Missing, Date}[Date("2019-10-05"), Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-02"), Date("2019-10-02"), Date("2020-01-01"), Date("2019-10-01"), Date("2019-10-02"), Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03")], Union{Missing, String}["A", "A", "A", "A", "A", "B", "B", "A", "B", "A", "B", "B", "B", "B", "A", "A", "B", "B"], Union{Missing, Int64}[3, 4, 1, 3, 4, 5, 6, 3, 5, 1, 8, 5, 6, 8, 1, 3, 5, 6]], ["date", "store", "employee_ID"]) + @test left_r1 == left_r1_t + MONTH(x) = month(x) MONTH(::Missing) = missing setformat!(store, 1=>MONTH) setformat!(roster, r"date"=>MONTH) + inn_r3 = innerjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], droprangecols = false, strict_inequality = [true, false]) @test inn_r3 == innerjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], droprangecols = false, strict_inequality = [true, false], method = :hash) inn_r3_t = Dataset([Union{Missing, Date}[Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-02"), Date("2019-10-01"), Date("2019-10-02"), Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-03"), Date("2019-10-03")], Union{Missing, String}["A", "A", "B", "B", "A", "B", "B", "A", "B"], 
Union{Missing, Int64}[1, 1, 5, 5, 1, 5, 5, 1, 5], Union{Missing, Date}[Date("2019-09-30"), Date("2019-09-30"), Date("2019-09-30"), Date("2019-09-30"), Date("2019-09-30"), Date("2019-09-30"), Date("2019-09-30"), Date("2019-09-30"), Date("2019-09-30")], Union{Missing, Date}[Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04")]], ["date", "store", "employee_ID", "start_date", "end_date"]) @test inn_r3 == inn_r3_t + + left_r3 = leftjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], droprangecols = false, strict_inequality = [true, false]) + @test left_r3 == leftjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], droprangecols = false, strict_inequality = [true, false], method = :hash) + + left_r3_t = Dataset([Union{Missing, Date}[Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-02"),Date("2020-01-01"), Date("2019-10-01"), Date("2019-10-02"), Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-03"), Date("2019-10-03")], Union{Missing, String}["A", "A", "B","A", "B", "A", "B", "B", "A", "B"], Union{Missing, Int64}[1, 1, 5,missing, 5, 1, 5, 5, 1, 5], Union{Missing, Date}[Date("2019-09-30"), Date("2019-09-30"), Date("2019-09-30"),missing, Date("2019-09-30"), Date("2019-09-30"), Date("2019-09-30"), Date("2019-09-30"), Date("2019-09-30"), Date("2019-09-30")], Union{Missing, Date}[Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"),missing, Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04")]], ["date", "store", "employee_ID", "start_date", "end_date"]) + @test left_r3 == left_r3_t + inn_r3 = innerjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], droprangecols = false, strict_inequality = [false, true]) @test inn_r3 == innerjoin(store, roster, on = [:store => :store, :date => 
(:start_date, :end_date)], droprangecols = false, strict_inequality = [false, true], method = :hash) inn_r3_t = Dataset([Union{Missing, Date}[Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-02"), Date("2019-10-03")], Union{Missing, String}["A", "A", "A", "A"], Union{Missing, Int64}[3, 3, 3, 3], Union{Missing, Date}[Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03")], Union{Missing, Date}[missing, missing, missing, missing]], ["date", "store", "employee_ID", "start_date", "end_date"]) @test inn_r3 == inn_r3_t + + left_r3 = leftjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], droprangecols = false, strict_inequality = [false, true]) + @test left_r3 == leftjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], droprangecols = false, strict_inequality = [false, true], method = :hash) + + left_r3_t = Dataset([Union{Missing, Date}[Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-02"), Date("2020-01-01"), Date("2019-10-01"), Date("2019-10-02"), Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-03"), Date("2019-10-03")], Union{Missing, String}["A", "A","B","A","B", "A","B","B", "A","B"], Union{Missing, Int64}[3, 3,missing,missing,missing, 3,missing,missing, 3,missing], Union{Missing, Date}[Date("2019-10-03"), Date("2019-10-03"),missing,missing,missing, Date("2019-10-03"),missing,missing, Date("2019-10-03"),missing], Union{Missing, Date}[missing, missing, missing, missing,missing,missing,missing,missing,missing,missing]], ["date", "store", "employee_ID", "start_date", "end_date"]) + @test left_r3 == left_r3_t + inn_r3 = innerjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], droprangecols = false, strict_inequality = [true, true]) @test inn_r3 == innerjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], droprangecols = false, strict_inequality = [true, true], method = :hash) inn_r3_t = Dataset([Union{Missing, Date}[], 
Union{Missing, String}[], Union{Missing, Int64}[], Union{Missing, Date}[], Union{Missing, Date}[]], ["date", "store", "employee_ID", "start_date", "end_date"]) @test inn_r3 == inn_r3_t + left_r3 = leftjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], droprangecols = false, strict_inequality = [true, true]) + @test left_r3 == leftjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], droprangecols = false, strict_inequality = [true, true], method = :hash) + + left_r3_t = Dataset([Union{Missing, Date}[Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-02"), Date("2020-01-01"), Date("2019-10-01"), Date("2019-10-02"), Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-03"), Date("2019-10-03")], Union{Missing, String}["A", "A","B","A","B", "A","B","B", "A","B"], Union{Missing, Int64}[missing, missing,missing,missing,missing, missing,missing,missing, missing,missing], Union{Missing, Date}[missing, missing,missing,missing,missing, missing,missing,missing, missing,missing], Union{Missing, Date}[missing, missing, missing, missing,missing,missing,missing,missing,missing,missing]], ["date", "store", "employee_ID", "start_date", "end_date"]) + @test left_r3 == left_r3_t + + inn_r1 = innerjoin(store, roster, on = [:store => :store, :date => (nothing, :start_date)], stable = true, mapformats = false) inn_r1_v = innerjoin(store, view(roster, :, [1,2,4,3]), on = [:store => :store, :date => (nothing, :start_date)], stable = true, mapformats = false) inn_r1_a = innerjoin(store, roster, on = [:store => :store, :date => (nothing, :start_date)], stable = true, mapformats = false, accelerate = true) @@ -2092,6 +2209,25 @@ end @test inn_r1_a == inn_r1_t @test inn_r1_v_a == inn_r1_t + + left_r1 = leftjoin(store, roster, on = [:store => :store, :date => (nothing, :start_date)], stable = true, mapformats = false) + left_r1_v = leftjoin(store, view(roster, :, [1,2,4,3]), on = [:store => :store, :date => (nothing, :start_date)], 
stable = true, mapformats = false) + left_r1_a = leftjoin(store, roster, on = [:store => :store, :date => (nothing, :start_date)], stable = true, mapformats = false, accelerate = true) + left_r1_v_a = leftjoin(store, view(roster, :, [1,2,4,3]), on = [:store => :store, :date => (nothing, :start_date)], stable = true, mapformats = false, accelerate = true) + + @test left_r1 == leftjoin(store, roster, on = [:store => :store, :date => (nothing, :start_date)], stable = true, mapformats = false, method = :hash) + @test left_r1_v == leftjoin(store, view(roster, :, [1,2,4,3]), on = [:store => :store, :date => (nothing, :start_date)], stable = true, mapformats = false, method = :hash) + @test left_r1_a == leftjoin(store, roster, on = [:store => :store, :date => (nothing, :start_date)], stable = true, mapformats = false, accelerate = true, method = :hash) + @test left_r1_v_a == leftjoin(store, view(roster, :, [1,2,4,3]), on = [:store => :store, :date => (nothing, :start_date)], stable = true, mapformats = false, accelerate = true, method = :hash) + + + left_r1_t = Dataset([Union{Missing, Date}[Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-02"), Date("2019-10-02"), Date("2019-10-02"), Date("2020-01-01"), Date("2019-10-01"), Date("2019-10-01"), Date("2019-10-01"), Date("2019-10-02"), Date("2019-10-02"), Date("2019-10-02"), Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03")], Union{Missing, String}["A", "A", "A", "B", "B", "B", "A", "B", "B", "B", "A", "A", "A", "B", "B", "B", "A", "A", "A", "B", "B"], Union{Missing, Int64}[2, 4, 2, 6, 8, 7, 2, 6, 8, 7, 3, 4, 2, 7, 8, 7, 3, 4, 2, 8, 7], Union{Missing, Date}[Date("2019-10-04"), Date("2019-10-06"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-06"), missing, Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-06"), missing, missing, Date("2019-10-06"), Date("2019-10-04"), missing, 
Date("2019-10-06"), missing, missing, Date("2019-10-06"), Date("2019-10-04"), Date("2019-10-06"), missing]], ["date", "store", "employee_ID", "end_date"]) + @test left_r1 == left_r1_t + @test left_r1_v == left_r1_t + @test left_r1_a == left_r1_t + @test left_r1_v_a == left_r1_t + + inn_r1 = innerjoin(store, roster, on = [:store => :store, :date => (:end_date, :start_date)], stable = true, mapformats = false) inn_r1_v = innerjoin(store, view(roster, :, [1,2,4,3]), on = [:store => :store, :date => (:end_date, :start_date)], stable = true, mapformats = false) inn_r1_a = innerjoin(store, roster, on = [:store => :store, :date => (:end_date, :start_date)], stable = true, mapformats = false, accelerate = true) @@ -2108,6 +2244,23 @@ end @test inn_r1_a == inn_r1_t @test inn_r1_v_a == inn_r1_t + left_r1 = leftjoin(store, roster, on = [:store => :store, :date => (:end_date, :start_date)], stable = true, mapformats = false) + left_r1_v = leftjoin(store, view(roster, :, [1,2,4,3]), on = [:store => :store, :date => (:end_date, :start_date)], stable = true, mapformats = false) + left_r1_a = leftjoin(store, roster, on = [:store => :store, :date => (:end_date, :start_date)], stable = true, mapformats = false, accelerate = true) + left_r1_v_a = leftjoin(store, view(roster, :, [1,2,4,3]), on = [:store => :store, :date => (:end_date, :start_date)], stable = true, mapformats = false, accelerate = true) + + @test left_r1 == leftjoin(store, roster, on = [:store => :store, :date => (:end_date, :start_date)], stable = true, mapformats = false, method = :hash) + @test left_r1_v == leftjoin(store, view(roster, :, [1,2,4,3]), on = [:store => :store, :date => (:end_date, :start_date)], stable = true, mapformats = false, method = :hash) + @test left_r1_a == leftjoin(store, roster, on = [:store => :store, :date => (:end_date, :start_date)], stable = true, mapformats = false, accelerate = true, method = :hash) + @test left_r1_v_a == leftjoin(store, view(roster, :, [1,2,4,3]), on = [:store => 
:store, :date => (:end_date, :start_date)], stable = true, mapformats = false, accelerate = true, method = :hash) + + + left_r1_t = Dataset([Union{Missing, Date}[Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-02"), Date("2020-01-01"), Date("2019-10-01"), Date("2019-10-02"), Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-03"), Date("2019-10-03")], Union{Missing, String}["A", "A", "B", "A", "B", "A", "B", "B", "A", "B"], Union{Missing, Int64}[2, 2,missing, 2, missing, missing, missing, missing, missing, missing]], ["date", "store", "employee_ID"]) + @test left_r1 == left_r1_t + @test left_r1_v == left_r1_t + @test left_r1_a == left_r1_t + @test left_r1_v_a == left_r1_t + inn_r1 = innerjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], stable = true, mapformats = false) inn_r1_v = innerjoin(store, view(roster, :, [1,2, 4,3]), on = [:store => :store, :date => (:start_date, :end_date)], stable = true, mapformats = false) inn_r1_a = innerjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], stable = true, mapformats = false, accelerate = true) @@ -2124,6 +2277,22 @@ end @test inn_r1_a == inn_r1_t @test inn_r1_v_a == inn_r1_t + left_r1 = leftjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], stable = true, mapformats = false) + left_r1_v = leftjoin(store, view(roster, :, [1,2,4,3]), on = [:store => :store, :date => (:start_date, :end_date)], stable = true, mapformats = false) + left_r1_a = leftjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], stable = true, mapformats = false, accelerate = true) + left_r1_v_a = leftjoin(store, view(roster, :, [1,2,4,3]), on = [:store => :store, :date => (:start_date, :end_date)], stable = true, mapformats = false, accelerate = true) + + @test left_r1 == leftjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], stable = true, mapformats = false, method = :hash) + @test left_r1_v == 
leftjoin(store, view(roster, :, [1,2,4,3]), on = [:store => :store, :date => (:start_date, :end_date)], stable = true, mapformats = false, method = :hash) + @test left_r1_a == leftjoin(store, roster, on = [:store => :store, :date => (:start_date, :end_date)], stable = true, mapformats = false, accelerate = true, method = :hash) + @test left_r1_v_a == leftjoin(store, view(roster, :, [1,2,4,3]), on = [:store => :store, :date => (:start_date, :end_date)], stable = true, mapformats = false, accelerate = true, method = :hash) + + + left_r1_t = Dataset([Union{Missing, Date}[Date("2019-10-05"), Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-02"), Date("2019-10-02"), Date("2020-01-01"), Date("2019-10-01"), Date("2019-10-02"), Date("2019-10-05"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-04"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03"), Date("2019-10-03")], Union{Missing, String}["A", "A", "A", "A", "A", "B", "B", "A", "B", "A", "B", "B", "B", "B", "A", "A", "B", "B"], Union{Missing, Int64}[3, 4, 1, 3, 4, 5, 6, 3, 5, 1, 8, 5, 6, 8, 1, 3, 5, 6]], ["date", "store", "employee_ID"]) + @test left_r1 == left_r1_t + @test left_r1_v == left_r1_t + @test left_r1_a == left_r1_t + @test left_r1_v_a == left_r1_t dsl = Dataset(x = [1,2,1,2], y = PooledArray([1.0, 5.0, 2.0, 1.0])) dsr = Dataset(x = [2,1,2], y1 = PooledArray([0, -1,1]), y2 = PooledArray([5,2,2]), z=[111,222,333]) @@ -2137,6 +2306,17 @@ end @test innerjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, :y2)], method = :hash, droprangecols = false, strict_inequality = true) == Dataset(x = [1,2], y = [1.0,1], y1 = [-1,0], y2 = [2,5], z= [222,111]) @test innerjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, nothing)], method = :hash, droprangecols = false) == Dataset(x = [1,2,2,1,2,2], y=[1.0, 5,5,2,1,1], y1 = [-1,0,1,-1,0,1], y2=[2,5,2,2,5,2], z = [222,111,333,222,111,333]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], droprangecols = false) == Dataset(x = 
[1,2,1,2,2], y = [1.0, 5,2,1,1], y1 = [-1,0,-1,1,0], y2 = [2,5,2,2,5], z= [222,111,222,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], droprangecols = false, strict_inequality = true) == Dataset(x = [1,2,1,2,2], y = [1.0,5,2,1,1], y1 = [-1,missing,missing,1,0], y2 = [2,missing,missing,2,5], z= [222,missing,missing,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, :y2)], droprangecols = false, strict_inequality = true) == Dataset(x = [1,2,1,2], y = [1.0,5,2,1], y1 = [-1,missing,missing,0], y2 = [2,missing,missing,5], z= [222,missing,missing,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, nothing)], droprangecols = false) == Dataset(x = [1,2,2,1,2,2], y=[1.0, 5,5,2,1,1], y1 = [-1,0,1,-1,0,1], y2=[2,5,2,2,5,2], z = [222,111,333,222,111,333]) + + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], method = :hash, droprangecols = false) == Dataset(x = [1,2,1,2,2], y = [1.0, 5,2,1,1], y1 = [-1,0,-1,1,0], y2 = [2,5,2,2,5], z= [222,111,222,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], method = :hash, droprangecols = false, strict_inequality = true) == Dataset(x = [1,2,1,2,2], y = [1.0,5,2,1,1], y1 = [-1,missing,missing,1,0], y2 = [2,missing,missing,2,5], z= [222,missing,missing,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, :y2)], method = :hash, droprangecols = false, strict_inequality = true) == Dataset(x = [1,2,1,2], y = [1.0,5,2,1], y1 = [-1,missing,missing,0], y2 = [2,missing,missing,5], z= [222,missing,missing,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, nothing)], method = :hash, droprangecols = false) == Dataset(x = [1,2,2,1,2,2], y=[1.0, 5,5,2,1,1], y1 = [-1,0,1,-1,0,1], y2=[2,5,2,2,5,2], z = [222,111,333,222,111,333]) + + dsl = Dataset(x = [1,2,1,2], y = ([1.0, 5.0, 2.0, 1.0])) dsr = Dataset(x = [2,1,2], y1 = PooledArray([0, -1,1]), y2 = PooledArray([5,2,2]), z=[111,222,333]) @test innerjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], droprangecols = false) == 
Dataset(x = [1,2,1,2,2], y = [1.0, 5,2,1,1], y1 = [-1,0,-1,1,0], y2 = [2,5,2,2,5], z= [222,111,222,333,111]) @@ -2149,6 +2329,16 @@ end @test innerjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, :y2)], method = :hash, droprangecols = false, strict_inequality = true) == Dataset(x = [1,2], y = [1.0,1], y1 = [-1,0], y2 = [2,5], z= [222,111]) @test innerjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, nothing)], method = :hash, droprangecols = false) == Dataset(x = [1,2,2,1,2,2], y=[1.0, 5,5,2,1,1], y1 = [-1,0,1,-1,0,1], y2=[2,5,2,2,5,2], z = [222,111,333,222,111,333]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], droprangecols = false) == Dataset(x = [1,2,1,2,2], y = [1.0, 5,2,1,1], y1 = [-1,0,-1,1,0], y2 = [2,5,2,2,5], z= [222,111,222,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], droprangecols = false, strict_inequality = true) == Dataset(x = [1,2,1,2,2], y = [1.0,5,2,1,1], y1 = [-1,missing,missing,1,0], y2 = [2,missing,missing,2,5], z= [222,missing,missing,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, :y2)], droprangecols = false, strict_inequality = true) == Dataset(x = [1,2,1,2], y = [1.0,5,2,1], y1 = [-1,missing,missing,0], y2 = [2,missing,missing,5], z= [222,missing,missing,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, nothing)], droprangecols = false) == Dataset(x = [1,2,2,1,2,2], y=[1.0, 5,5,2,1,1], y1 = [-1,0,1,-1,0,1], y2=[2,5,2,2,5,2], z = [222,111,333,222,111,333]) + + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], method = :hash, droprangecols = false) == Dataset(x = [1,2,1,2,2], y = [1.0, 5,2,1,1], y1 = [-1,0,-1,1,0], y2 = [2,5,2,2,5], z= [222,111,222,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], method = :hash, droprangecols = false, strict_inequality = true) == Dataset(x = [1,2,1,2,2], y = [1.0,5,2,1,1], y1 = [-1,missing,missing,1,0], y2 = [2,missing,missing,2,5], z= [222,missing,missing,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, :y2)], method = 
:hash, droprangecols = false, strict_inequality = true) == Dataset(x = [1,2,1,2], y = [1.0,5,2,1], y1 = [-1,missing,missing,0], y2 = [2,missing,missing,5], z= [222,missing,missing,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, nothing)], method = :hash, droprangecols = false) == Dataset(x = [1,2,2,1,2,2], y=[1.0, 5,5,2,1,1], y1 = [-1,0,1,-1,0,1], y2=[2,5,2,2,5,2], z = [222,111,333,222,111,333]) + dsl = Dataset(x = [1,2,1,2], y = PooledArray([1.0, 5.0, 2.0, 1.0])) dsr = Dataset(x = [2,1,2], y1 = ([0, -1,1]), y2 = PooledArray([5,2,2]), z=[111,222,333]) @test innerjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], droprangecols = false) == Dataset(x = [1,2,1,2,2], y = [1.0, 5,2,1,1], y1 = [-1,0,-1,1,0], y2 = [2,5,2,2,5], z= [222,111,222,333,111]) @@ -2161,6 +2351,16 @@ end @test innerjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, :y2)], method = :hash, droprangecols = false, strict_inequality = true) == Dataset(x = [1,2], y = [1.0,1], y1 = [-1,0], y2 = [2,5], z= [222,111]) @test innerjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, nothing)], method = :hash, droprangecols = false) == Dataset(x = [1,2,2,1,2,2], y=[1.0, 5,5,2,1,1], y1 = [-1,0,1,-1,0,1], y2=[2,5,2,2,5,2], z = [222,111,333,222,111,333]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], droprangecols = false) == Dataset(x = [1,2,1,2,2], y = [1.0, 5,2,1,1], y1 = [-1,0,-1,1,0], y2 = [2,5,2,2,5], z= [222,111,222,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], droprangecols = false, strict_inequality = true) == Dataset(x = [1,2,1,2,2], y = [1.0,5,2,1,1], y1 = [-1,missing,missing,1,0], y2 = [2,missing,missing,2,5], z= [222,missing,missing,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, :y2)], droprangecols = false, strict_inequality = true) == Dataset(x = [1,2,1,2], y = [1.0,5,2,1], y1 = [-1,missing,missing,0], y2 = [2,missing,missing,5], z= [222,missing,missing,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, nothing)], droprangecols = false) == Dataset(x = 
[1,2,2,1,2,2], y=[1.0, 5,5,2,1,1], y1 = [-1,0,1,-1,0,1], y2=[2,5,2,2,5,2], z = [222,111,333,222,111,333]) + + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], method = :hash, droprangecols = false) == Dataset(x = [1,2,1,2,2], y = [1.0, 5,2,1,1], y1 = [-1,0,-1,1,0], y2 = [2,5,2,2,5], z= [222,111,222,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], method = :hash, droprangecols = false, strict_inequality = true) == Dataset(x = [1,2,1,2,2], y = [1.0,5,2,1,1], y1 = [-1,missing,missing,1,0], y2 = [2,missing,missing,2,5], z= [222,missing,missing,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, :y2)], method = :hash, droprangecols = false, strict_inequality = true) == Dataset(x = [1,2,1,2], y = [1.0,5,2,1], y1 = [-1,missing,missing,0], y2 = [2,missing,missing,5], z= [222,missing,missing,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, nothing)], method = :hash, droprangecols = false) == Dataset(x = [1,2,2,1,2,2], y=[1.0, 5,5,2,1,1], y1 = [-1,0,1,-1,0,1], y2=[2,5,2,2,5,2], z = [222,111,333,222,111,333]) + dsl = Dataset(x = [1,2,1,2], y = PooledArray([1.0, 5.0, 2.0, 1.0])) dsr = Dataset(x = [2,1,2], y1 = ([0, -1,1]), y2 = ([5,2,2]), z=[111,222,333]) @test innerjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], droprangecols = false) == Dataset(x = [1,2,1,2,2], y = [1.0, 5,2,1,1], y1 = [-1,0,-1,1,0], y2 = [2,5,2,2,5], z= [222,111,222,333,111]) @@ -2173,6 +2373,16 @@ end @test innerjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, :y2)], method = :hash, droprangecols = false, strict_inequality = true) == Dataset(x = [1,2], y = [1.0,1], y1 = [-1,0], y2 = [2,5], z= [222,111]) @test innerjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, nothing)], method = :hash, droprangecols = false) == Dataset(x = [1,2,2,1,2,2], y=[1.0, 5,5,2,1,1], y1 = [-1,0,1,-1,0,1], y2=[2,5,2,2,5,2], z = [222,111,333,222,111,333]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], droprangecols = false) == Dataset(x = [1,2,1,2,2], y = [1.0, 5,2,1,1], y1 = 
[-1,0,-1,1,0], y2 = [2,5,2,2,5], z= [222,111,222,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], droprangecols = false, strict_inequality = true) == Dataset(x = [1,2,1,2,2], y = [1.0,5,2,1,1], y1 = [-1,missing,missing,1,0], y2 = [2,missing,missing,2,5], z= [222,missing,missing,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, :y2)], droprangecols = false, strict_inequality = true) == Dataset(x = [1,2,1,2], y = [1.0,5,2,1], y1 = [-1,missing,missing,0], y2 = [2,missing,missing,5], z= [222,missing,missing,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, nothing)], droprangecols = false) == Dataset(x = [1,2,2,1,2,2], y=[1.0, 5,5,2,1,1], y1 = [-1,0,1,-1,0,1], y2=[2,5,2,2,5,2], z = [222,111,333,222,111,333]) + + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], method = :hash, droprangecols = false) == Dataset(x = [1,2,1,2,2], y = [1.0, 5,2,1,1], y1 = [-1,0,-1,1,0], y2 = [2,5,2,2,5], z= [222,111,222,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], method = :hash, droprangecols = false, strict_inequality = true) == Dataset(x = [1,2,1,2,2], y = [1.0,5,2,1,1], y1 = [-1,missing,missing,1,0], y2 = [2,missing,missing,2,5], z= [222,missing,missing,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, :y2)], method = :hash, droprangecols = false, strict_inequality = true) == Dataset(x = [1,2,1,2], y = [1.0,5,2,1], y1 = [-1,missing,missing,0], y2 = [2,missing,missing,5], z= [222,missing,missing,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, nothing)], method = :hash, droprangecols = false) == Dataset(x = [1,2,2,1,2,2], y=[1.0, 5,5,2,1,1], y1 = [-1,0,1,-1,0,1], y2=[2,5,2,2,5,2], z = [222,111,333,222,111,333]) + dsl = Dataset(x = [1,2,1,2], y = ([1.0, 5.0, 2.0, 1.0])) dsr = Dataset(x = [2,1,2], y1 = ([0, -1,1]), y2 = ([5,2,2]), z=[111,222,333]) @test innerjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], droprangecols = false) == Dataset(x = [1,2,1,2,2], y = [1.0, 5,2,1,1], y1 = 
[-1,0,-1,1,0], y2 = [2,5,2,2,5], z= [222,111,222,333,111]) @@ -2185,6 +2395,16 @@ end @test innerjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, :y2)], method = :hash, droprangecols = false, strict_inequality = true) == Dataset(x = [1,2], y = [1.0,1], y1 = [-1,0], y2 = [2,5], z= [222,111]) @test innerjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, nothing)], method = :hash, droprangecols = false) == Dataset(x = [1,2,2,1,2,2], y=[1.0, 5,5,2,1,1], y1 = [-1,0,1,-1,0,1], y2=[2,5,2,2,5,2], z = [222,111,333,222,111,333]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], droprangecols = false) == Dataset(x = [1,2,1,2,2], y = [1.0, 5,2,1,1], y1 = [-1,0,-1,1,0], y2 = [2,5,2,2,5], z= [222,111,222,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], droprangecols = false, strict_inequality = true) == Dataset(x = [1,2,1,2,2], y = [1.0,5,2,1,1], y1 = [-1,missing,missing,1,0], y2 = [2,missing,missing,2,5], z= [222,missing,missing,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, :y2)], droprangecols = false, strict_inequality = true) == Dataset(x = [1,2,1,2], y = [1.0,5,2,1], y1 = [-1,missing,missing,0], y2 = [2,missing,missing,5], z= [222,missing,missing,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, nothing)], droprangecols = false) == Dataset(x = [1,2,2,1,2,2], y=[1.0, 5,5,2,1,1], y1 = [-1,0,1,-1,0,1], y2=[2,5,2,2,5,2], z = [222,111,333,222,111,333]) + + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], method = :hash, droprangecols = false) == Dataset(x = [1,2,1,2,2], y = [1.0, 5,2,1,1], y1 = [-1,0,-1,1,0], y2 = [2,5,2,2,5], z= [222,111,222,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], method = :hash, droprangecols = false, strict_inequality = true) == Dataset(x = [1,2,1,2,2], y = [1.0,5,2,1,1], y1 = [-1,missing,missing,1,0], y2 = [2,missing,missing,2,5], z= [222,missing,missing,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, :y2)], method = :hash, droprangecols = false, strict_inequality = 
true) == Dataset(x = [1,2,1,2], y = [1.0,5,2,1], y1 = [-1,missing,missing,0], y2 = [2,missing,missing,5], z= [222,missing,missing,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, nothing)], method = :hash, droprangecols = false) == Dataset(x = [1,2,2,1,2,2], y=[1.0, 5,5,2,1,1], y1 = [-1,0,1,-1,0,1], y2=[2,5,2,2,5,2], z = [222,111,333,222,111,333]) + #views dsl1 = Dataset(x = [1,2,1,2], y = PooledArray([1.0, 5.0, 2.0, 1.0])) @@ -2201,6 +2421,16 @@ end @test innerjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, :y2)], droprangecols = false, strict_inequality = true, method = :hash) == Dataset(x = [1,2], y = [1.0,1], y1 = [-1,0], y2 = [2,5], z= [222,111]) @test innerjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, nothing)], droprangecols = false, method = :hash) == Dataset(x = [1,2,2,1,2,2], y=[1.0, 5,5,2,1,1], y1 = [-1,0,1,-1,0,1], y2=[2,5,2,2,5,2], z = [222,111,333,222,111,333]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], droprangecols = false) == Dataset(x = [1,2,1,2,2], y = [1.0, 5,2,1,1], y1 = [-1,0,-1,1,0], y2 = [2,5,2,2,5], z= [222,111,222,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], droprangecols = false, strict_inequality = true) == Dataset(x = [1,2,1,2,2], y = [1.0,5,2,1,1], y1 = [-1,missing,missing,1,0], y2 = [2,missing,missing,2,5], z= [222,missing,missing,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, :y2)], droprangecols = false, strict_inequality = true) == Dataset(x = [1,2,1,2], y = [1.0,5,2,1], y1 = [-1,missing,missing,0], y2 = [2,missing,missing,5], z= [222,missing,missing,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, nothing)], droprangecols = false) == Dataset(x = [1,2,2,1,2,2], y=[1.0, 5,5,2,1,1], y1 = [-1,0,1,-1,0,1], y2=[2,5,2,2,5,2], z = [222,111,333,222,111,333]) + + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], method = :hash, droprangecols = false) == Dataset(x = [1,2,1,2,2], y = [1.0, 5,2,1,1], y1 = [-1,0,-1,1,0], y2 = [2,5,2,2,5], z= [222,111,222,333,111]) + @test 
leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], method = :hash, droprangecols = false, strict_inequality = true) == Dataset(x = [1,2,1,2,2], y = [1.0,5,2,1,1], y1 = [-1,missing,missing,1,0], y2 = [2,missing,missing,2,5], z= [222,missing,missing,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, :y2)], method = :hash, droprangecols = false, strict_inequality = true) == Dataset(x = [1,2,1,2], y = [1.0,5,2,1], y1 = [-1,missing,missing,0], y2 = [2,missing,missing,5], z= [222,missing,missing,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, nothing)], method = :hash, droprangecols = false) == Dataset(x = [1,2,2,1,2,2], y=[1.0, 5,5,2,1,1], y1 = [-1,0,1,-1,0,1], y2=[2,5,2,2,5,2], z = [222,111,333,222,111,333]) + dsl1 = Dataset(x = [1,2,1,2], y = ([1.0, 5.0, 2.0, 1.0])) dsr1 = Dataset(x = [2,1,2], y1 = PooledArray([0, -1,1]), y2 = PooledArray([5,2,2]), z=[111,222,333]) dsl = view(dsl1, [1,2,3,4], [1,2]) @@ -2215,6 +2445,16 @@ end @test innerjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, :y2)], droprangecols = false, strict_inequality = true, method = :hash) == Dataset(x = [1,2], y = [1.0,1], y1 = [-1,0], y2 = [2,5], z= [222,111]) @test innerjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, nothing)], droprangecols = false, method = :hash) == Dataset(x = [1,2,2,1,2,2], y=[1.0, 5,5,2,1,1], y1 = [-1,0,1,-1,0,1], y2=[2,5,2,2,5,2], z = [222,111,333,222,111,333]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], droprangecols = false) == Dataset(x = [1,2,1,2,2], y = [1.0, 5,2,1,1], y1 = [-1,0,-1,1,0], y2 = [2,5,2,2,5], z= [222,111,222,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], droprangecols = false, strict_inequality = true) == Dataset(x = [1,2,1,2,2], y = [1.0,5,2,1,1], y1 = [-1,missing,missing,1,0], y2 = [2,missing,missing,2,5], z= [222,missing,missing,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, :y2)], droprangecols = false, strict_inequality = true) == Dataset(x = [1,2,1,2], y = [1.0,5,2,1], y1 = 
[-1,missing,missing,0], y2 = [2,missing,missing,5], z= [222,missing,missing,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, nothing)], droprangecols = false) == Dataset(x = [1,2,2,1,2,2], y=[1.0, 5,5,2,1,1], y1 = [-1,0,1,-1,0,1], y2=[2,5,2,2,5,2], z = [222,111,333,222,111,333]) + + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], method = :hash, droprangecols = false) == Dataset(x = [1,2,1,2,2], y = [1.0, 5,2,1,1], y1 = [-1,0,-1,1,0], y2 = [2,5,2,2,5], z= [222,111,222,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], method = :hash, droprangecols = false, strict_inequality = true) == Dataset(x = [1,2,1,2,2], y = [1.0,5,2,1,1], y1 = [-1,missing,missing,1,0], y2 = [2,missing,missing,2,5], z= [222,missing,missing,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, :y2)], method = :hash, droprangecols = false, strict_inequality = true) == Dataset(x = [1,2,1,2], y = [1.0,5,2,1], y1 = [-1,missing,missing,0], y2 = [2,missing,missing,5], z= [222,missing,missing,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, nothing)], method = :hash, droprangecols = false) == Dataset(x = [1,2,2,1,2,2], y=[1.0, 5,5,2,1,1], y1 = [-1,0,1,-1,0,1], y2=[2,5,2,2,5,2], z = [222,111,333,222,111,333]) + dsl1 = Dataset(x = [1,2,1,2], y = PooledArray([1.0, 5.0, 2.0, 1.0])) dsr1 = Dataset(x = [2,1,2], y1 = ([0, -1,1]), y2 = PooledArray([5,2,2]), z=[111,222,333]) @test innerjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], droprangecols = false) == Dataset(x = [1,2,1,2,2], y = [1.0, 5,2,1,1], y1 = [-1,0,-1,1,0], y2 = [2,5,2,2,5], z= [222,111,222,333,111]) @@ -2227,6 +2467,16 @@ end @test innerjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, :y2)], droprangecols = false, strict_inequality = true, method = :hash) == Dataset(x = [1,2], y = [1.0,1], y1 = [-1,0], y2 = [2,5], z= [222,111]) @test innerjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, nothing)], droprangecols = false, method = :hash) == Dataset(x = [1,2,2,1,2,2], y=[1.0, 5,5,2,1,1], y1 = [-1,0,1,-1,0,1], 
y2=[2,5,2,2,5,2], z = [222,111,333,222,111,333]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], droprangecols = false) == Dataset(x = [1,2,1,2,2], y = [1.0, 5,2,1,1], y1 = [-1,0,-1,1,0], y2 = [2,5,2,2,5], z= [222,111,222,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], droprangecols = false, strict_inequality = true) == Dataset(x = [1,2,1,2,2], y = [1.0,5,2,1,1], y1 = [-1,missing,missing,1,0], y2 = [2,missing,missing,2,5], z= [222,missing,missing,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, :y2)], droprangecols = false, strict_inequality = true) == Dataset(x = [1,2,1,2], y = [1.0,5,2,1], y1 = [-1,missing,missing,0], y2 = [2,missing,missing,5], z= [222,missing,missing,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, nothing)], droprangecols = false) == Dataset(x = [1,2,2,1,2,2], y=[1.0, 5,5,2,1,1], y1 = [-1,0,1,-1,0,1], y2=[2,5,2,2,5,2], z = [222,111,333,222,111,333]) + + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], method = :hash, droprangecols = false) == Dataset(x = [1,2,1,2,2], y = [1.0, 5,2,1,1], y1 = [-1,0,-1,1,0], y2 = [2,5,2,2,5], z= [222,111,222,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], method = :hash, droprangecols = false, strict_inequality = true) == Dataset(x = [1,2,1,2,2], y = [1.0,5,2,1,1], y1 = [-1,missing,missing,1,0], y2 = [2,missing,missing,2,5], z= [222,missing,missing,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, :y2)], method = :hash, droprangecols = false, strict_inequality = true) == Dataset(x = [1,2,1,2], y = [1.0,5,2,1], y1 = [-1,missing,missing,0], y2 = [2,missing,missing,5], z= [222,missing,missing,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, nothing)], method = :hash, droprangecols = false) == Dataset(x = [1,2,2,1,2,2], y=[1.0, 5,5,2,1,1], y1 = [-1,0,1,-1,0,1], y2=[2,5,2,2,5,2], z = [222,111,333,222,111,333]) + dsl1 = Dataset(x = [1,2,1,2], y = PooledArray([1.0, 5.0, 2.0, 1.0])) dsr1 = Dataset(x = [2,1,2], 
y1 = ([0, -1,1]), y2 = ([5,2,2]), z=[111,222,333]) dsl = view(dsl1, [1,2,3,4], [1,2]) @@ -2241,6 +2491,16 @@ end @test innerjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, :y2)], droprangecols = false, strict_inequality = true, method = :hash) == Dataset(x = [1,2], y = [1.0,1], y1 = [-1,0], y2 = [2,5], z= [222,111]) @test innerjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, nothing)], droprangecols = false, method = :hash) == Dataset(x = [1,2,2,1,2,2], y=[1.0, 5,5,2,1,1], y1 = [-1,0,1,-1,0,1], y2=[2,5,2,2,5,2], z = [222,111,333,222,111,333]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], droprangecols = false) == Dataset(x = [1,2,1,2,2], y = [1.0, 5,2,1,1], y1 = [-1,0,-1,1,0], y2 = [2,5,2,2,5], z= [222,111,222,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], droprangecols = false, strict_inequality = true) == Dataset(x = [1,2,1,2,2], y = [1.0,5,2,1,1], y1 = [-1,missing,missing,1,0], y2 = [2,missing,missing,2,5], z= [222,missing,missing,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, :y2)], droprangecols = false, strict_inequality = true) == Dataset(x = [1,2,1,2], y = [1.0,5,2,1], y1 = [-1,missing,missing,0], y2 = [2,missing,missing,5], z= [222,missing,missing,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, nothing)], droprangecols = false) == Dataset(x = [1,2,2,1,2,2], y=[1.0, 5,5,2,1,1], y1 = [-1,0,1,-1,0,1], y2=[2,5,2,2,5,2], z = [222,111,333,222,111,333]) + + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], method = :hash, droprangecols = false) == Dataset(x = [1,2,1,2,2], y = [1.0, 5,2,1,1], y1 = [-1,0,-1,1,0], y2 = [2,5,2,2,5], z= [222,111,222,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], method = :hash, droprangecols = false, strict_inequality = true) == Dataset(x = [1,2,1,2,2], y = [1.0,5,2,1,1], y1 = [-1,missing,missing,1,0], y2 = [2,missing,missing,2,5], z= [222,missing,missing,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, :y2)], method = :hash, droprangecols = 
false, strict_inequality = true) == Dataset(x = [1,2,1,2], y = [1.0,5,2,1], y1 = [-1,missing,missing,0], y2 = [2,missing,missing,5], z= [222,missing,missing,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, nothing)], method = :hash, droprangecols = false) == Dataset(x = [1,2,2,1,2,2], y=[1.0, 5,5,2,1,1], y1 = [-1,0,1,-1,0,1], y2=[2,5,2,2,5,2], z = [222,111,333,222,111,333]) + dsl1 = Dataset(x = [1,2,1,2], y = ([1.0, 5.0, 2.0, 1.0])) dsr1 = Dataset(x = [2,1,2], y1 = ([0, -1,1]), y2 = ([5,2,2]), z=[111,222,333]) dsl = view(dsl1, [1,2,3,4], [1,2]) @@ -2255,6 +2515,16 @@ end @test innerjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, :y2)], droprangecols = false, strict_inequality = true, method = :hash) == Dataset(x = [1,2], y = [1.0,1], y1 = [-1,0], y2 = [2,5], z= [222,111]) @test innerjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, nothing)], droprangecols = false, method = :hash) == Dataset(x = [1,2,2,1,2,2], y=[1.0, 5,5,2,1,1], y1 = [-1,0,1,-1,0,1], y2=[2,5,2,2,5,2], z = [222,111,333,222,111,333]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], droprangecols = false) == Dataset(x = [1,2,1,2,2], y = [1.0, 5,2,1,1], y1 = [-1,0,-1,1,0], y2 = [2,5,2,2,5], z= [222,111,222,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], droprangecols = false, strict_inequality = true) == Dataset(x = [1,2,1,2,2], y = [1.0,5,2,1,1], y1 = [-1,missing,missing,1,0], y2 = [2,missing,missing,2,5], z= [222,missing,missing,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, :y2)], droprangecols = false, strict_inequality = true) == Dataset(x = [1,2,1,2], y = [1.0,5,2,1], y1 = [-1,missing,missing,0], y2 = [2,missing,missing,5], z= [222,missing,missing,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, nothing)], droprangecols = false) == Dataset(x = [1,2,2,1,2,2], y=[1.0, 5,5,2,1,1], y1 = [-1,0,1,-1,0,1], y2=[2,5,2,2,5,2], z = [222,111,333,222,111,333]) + + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], method = :hash, droprangecols = false) 
== Dataset(x = [1,2,1,2,2], y = [1.0, 5,2,1,1], y1 = [-1,0,-1,1,0], y2 = [2,5,2,2,5], z= [222,111,222,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], method = :hash, droprangecols = false, strict_inequality = true) == Dataset(x = [1,2,1,2,2], y = [1.0,5,2,1,1], y1 = [-1,missing,missing,1,0], y2 = [2,missing,missing,2,5], z= [222,missing,missing,333,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, :y2)], method = :hash, droprangecols = false, strict_inequality = true) == Dataset(x = [1,2,1,2], y = [1.0,5,2,1], y1 = [-1,missing,missing,0], y2 = [2,missing,missing,5], z= [222,missing,missing,111]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(:y1, nothing)], method = :hash, droprangecols = false) == Dataset(x = [1,2,2,1,2,2], y=[1.0, 5,5,2,1,1], y1 = [-1,0,1,-1,0,1], y2=[2,5,2,2,5,2], z = [222,111,333,222,111,333]) + dsl1 = Dataset(x = [1,2,1,2], y = PooledArray([1.0, 5.0, 2.0, 1.0])) dsr1 = Dataset(x = [2,1,2], y1 = PooledArray([0, -1,1]), y2 = PooledArray([5,2,2]), z=[111,222,333]) @@ -2262,14 +2532,20 @@ end dsr = view(dsr1, [3,1,2,2], [4,1,3,2]) @test innerjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], droprangecols = false) == Dataset(y = [fill(1.0, 10); fill(5,2)], x = [fill(2,6);fill(1,4);fill(2,2)], z = [repeat([333,111], 3); fill(222,4); fill(111,2)], y2 = [2,5,2,5,2,5,2,2,2,2,5,5], y1 = [1,0,1,0,1,0, -1,-1,-1,-1, 0, 0]) @test innerjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], droprangecols = false, method = :hash) == Dataset(y = [fill(1.0, 10); fill(5,2)], x = [fill(2,6);fill(1,4);fill(2,2)], z = [repeat([333,111], 3); fill(222,4); fill(111,2)], y2 = [2,5,2,5,2,5,2,2,2,2,5,5], y1 = [1,0,1,0,1,0, -1,-1,-1,-1, 0, 0]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], droprangecols = false) == Dataset(y = [fill(1.0, 10); fill(5,2)], x = [fill(2,6);fill(1,4);fill(2,2)], z = [repeat([333,111], 3); fill(222,4); fill(111,2)], y2 = [2,5,2,5,2,5,2,2,2,2,5,5], y1 = [1,0,1,0,1,0, -1,-1,-1,-1, 0, 0]) + @test 
leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], droprangecols = false, method = :hash) == Dataset(y = [fill(1.0, 10); fill(5,2)], x = [fill(2,6);fill(1,4);fill(2,2)], z = [repeat([333,111], 3); fill(222,4); fill(111,2)], y2 = [2,5,2,5,2,5,2,2,2,2,5,5], y1 = [1,0,1,0,1,0, -1,-1,-1,-1, 0, 0]) + dsr = Dataset(view(dsr1, [3,1,2,2], [4,1,3,2])) @test innerjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], droprangecols = false) == Dataset(y = [fill(1.0, 10); fill(5,2)], x = [fill(2,6);fill(1,4);fill(2,2)], z = [repeat([333,111], 3); fill(222,4); fill(111,2)], y2 = [2,5,2,5,2,5,2,2,2,2,5,5], y1 = [1,0,1,0,1,0, -1,-1,-1,-1, 0, 0]) @test innerjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], droprangecols = false, method = :hash) == Dataset(y = [fill(1.0, 10); fill(5,2)], x = [fill(2,6);fill(1,4);fill(2,2)], z = [repeat([333,111], 3); fill(222,4); fill(111,2)], y2 = [2,5,2,5,2,5,2,2,2,2,5,5], y1 = [1,0,1,0,1,0, -1,-1,-1,-1, 0, 0]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], droprangecols = false) == Dataset(y = [fill(1.0, 10); fill(5,2)], x = [fill(2,6);fill(1,4);fill(2,2)], z = [repeat([333,111], 3); fill(222,4); fill(111,2)], y2 = [2,5,2,5,2,5,2,2,2,2,5,5], y1 = [1,0,1,0,1,0, -1,-1,-1,-1, 0, 0]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], droprangecols = false, method = :hash) == Dataset(y = [fill(1.0, 10); fill(5,2)], x = [fill(2,6);fill(1,4);fill(2,2)], z = [repeat([333,111], 3); fill(222,4); fill(111,2)], y2 = [2,5,2,5,2,5,2,2,2,2,5,5], y1 = [1,0,1,0,1,0, -1,-1,-1,-1, 0, 0]) dsl = Dataset(view(dsl1, [4,4,4,1,1,2,2], [2,1])) dsr = view(dsr1, [3,1,2,2], [4,1,3,2]) @test innerjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], droprangecols = false) == Dataset(y = [fill(1.0, 10); fill(5,2)], x = [fill(2,6);fill(1,4);fill(2,2)], z = [repeat([333,111], 3); fill(222,4); fill(111,2)], y2 = [2,5,2,5,2,5,2,2,2,2,5,5], y1 = [1,0,1,0,1,0, -1,-1,-1,-1, 0, 0]) @test innerjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], droprangecols = false, 
method = :hash) == Dataset(y = [fill(1.0, 10); fill(5,2)], x = [fill(2,6);fill(1,4);fill(2,2)], z = [repeat([333,111], 3); fill(222,4); fill(111,2)], y2 = [2,5,2,5,2,5,2,2,2,2,5,5], y1 = [1,0,1,0,1,0, -1,-1,-1,-1, 0, 0]) - + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], droprangecols = false) == Dataset(y = [fill(1.0, 10); fill(5,2)], x = [fill(2,6);fill(1,4);fill(2,2)], z = [repeat([333,111], 3); fill(222,4); fill(111,2)], y2 = [2,5,2,5,2,5,2,2,2,2,5,5], y1 = [1,0,1,0,1,0, -1,-1,-1,-1, 0, 0]) + @test leftjoin(dsl, dsr, on = [:x=>:x, :y=>(nothing, :y2)], droprangecols = false, method = :hash) == Dataset(y = [fill(1.0, 10); fill(5,2)], x = [fill(2,6);fill(1,4);fill(2,2)], z = [repeat([333,111], 3); fill(222,4); fill(111,2)], y2 = [2,5,2,5,2,5,2,2,2,2,5,5], y1 = [1,0,1,0,1,0, -1,-1,-1,-1, 0, 0]) dsl = Dataset(rand(1:10, 10, 3), [:x1,:x2, :x3]) dsr = Dataset(rand(1:10, 4,3), [:x1, :x2, :y]) @@ -2298,6 +2574,26 @@ end @test innerjoin(view(dsl, l_ridx, l_cidx), view(dsr, r_ridx, r_cidx), on = [:x1=>:x1, :x2=>(nothing, :y)], droprangecols = false, makeunique = true, method = :hash) == innerjoin(Dataset(view(dsl, l_ridx, l_cidx)), Dataset(view(dsr, r_ridx, r_cidx)), on = [:x1=>:x1, :x2=>(nothing, :y)], droprangecols = false, makeunique = true, stable = true) @test innerjoin(view(dsl, l_ridx, l_cidx), view(dsr, r_ridx, r_cidx), on = [:x1=>:x1, :x2=>(:x2, :y)], droprangecols = false, makeunique = true, strict_inequality = true, method = :hash) == innerjoin(Dataset(view(dsl, l_ridx, l_cidx)), Dataset(view(dsr, r_ridx, r_cidx)), on = [:x1=>:x1, :x2=>(:x2, :y)], droprangecols = false, makeunique = true, strict_inequality = true, stable = true) + @test leftjoin(view(dsl, l_ridx, l_cidx), dsr, on = [:x1=>:x1, :x2=>(:x2, :y)], droprangecols = false, makeunique = true) == leftjoin(Dataset(view(dsl, l_ridx, l_cidx)), dsr, on = [:x1=>:x1, :x2=>(:x2, :y)], droprangecols = false, makeunique = true) + @test leftjoin(view(dsl, l_ridx, l_cidx), dsr, on = [:x1=>:x1, 
:x2=>(:x2, nothing)], droprangecols = false, makeunique = true) == leftjoin(Dataset(view(dsl, l_ridx, l_cidx)), dsr, on = [:x1=>:x1, :x2=>(:x2, nothing)], droprangecols = false, makeunique = true) + @test leftjoin(view(dsl, l_ridx, l_cidx), dsr, on = [:x1=>:x1, :x2=>(nothing, :y)], droprangecols = false, makeunique = true) == leftjoin(Dataset(view(dsl, l_ridx, l_cidx)), dsr, on = [:x1=>:x1, :x2=>(nothing, :y)], droprangecols = false, makeunique = true) + @test leftjoin(view(dsl, l_ridx, l_cidx), dsr, on = [:x1=>:x1, :x2=>(:x2, :y)], droprangecols = false, makeunique = true, strict_inequality = true) == leftjoin(Dataset(view(dsl, l_ridx, l_cidx)), dsr, on = [:x1=>:x1, :x2=>(:x2, :y)], droprangecols = false, makeunique = true, strict_inequality = true) + + @test leftjoin(view(dsl, l_ridx, l_cidx), dsr, on = [:x1=>:x1, :x2=>(:x2, :y)], droprangecols = false, makeunique = true, method = :hash) == leftjoin(Dataset(view(dsl, l_ridx, l_cidx)), dsr, on = [:x1=>:x1, :x2=>(:x2, :y)], droprangecols = false, makeunique = true, stable = true) + @test leftjoin(view(dsl, l_ridx, l_cidx), dsr, on = [:x1=>:x1, :x2=>(:x2, nothing)], droprangecols = false, makeunique = true, method = :hash) == leftjoin(Dataset(view(dsl, l_ridx, l_cidx)), dsr, on = [:x1=>:x1, :x2=>(:x2, nothing)], droprangecols = false, makeunique = true, stable = true) + @test leftjoin(view(dsl, l_ridx, l_cidx), dsr, on = [:x1=>:x1, :x2=>(nothing, :y)], droprangecols = false, makeunique = true, method = :hash) == leftjoin(Dataset(view(dsl, l_ridx, l_cidx)), dsr, on = [:x1=>:x1, :x2=>(nothing, :y)], droprangecols = false, makeunique = true, stable = true) + @test leftjoin(view(dsl, l_ridx, l_cidx), dsr, on = [:x1=>:x1, :x2=>(:x2, :y)], droprangecols = false, makeunique = true, strict_inequality = true, method = :hash) == leftjoin(Dataset(view(dsl, l_ridx, l_cidx)), dsr, on = [:x1=>:x1, :x2=>(:x2, :y)], droprangecols = false, makeunique = true, strict_inequality = true, stable=true) + + @test leftjoin(view(dsl, l_ridx, 
l_cidx), view(dsr, r_ridx, r_cidx), on = [:x1=>:x1, :x2=>(:x2, :y)], droprangecols = false, makeunique = true) == leftjoin(Dataset(view(dsl, l_ridx, l_cidx)), Dataset(view(dsr, r_ridx, r_cidx)), on = [:x1=>:x1, :x2=>(:x2, :y)], droprangecols = false, makeunique = true) + @test leftjoin(view(dsl, l_ridx, l_cidx), view(dsr, r_ridx, r_cidx), on = [:x1=>:x1, :x2=>(:x2, nothing)], droprangecols = false, makeunique = true) == leftjoin(Dataset(view(dsl, l_ridx, l_cidx)), Dataset(view(dsr, r_ridx, r_cidx)), on = [:x1=>:x1, :x2=>(:x2, nothing)], droprangecols = false, makeunique = true) + @test leftjoin(view(dsl, l_ridx, l_cidx), view(dsr, r_ridx, r_cidx), on = [:x1=>:x1, :x2=>(nothing, :y)], droprangecols = false, makeunique = true) == leftjoin(Dataset(view(dsl, l_ridx, l_cidx)), Dataset(view(dsr, r_ridx, r_cidx)), on = [:x1=>:x1, :x2=>(nothing, :y)], droprangecols = false, makeunique = true) + @test leftjoin(view(dsl, l_ridx, l_cidx), view(dsr, r_ridx, r_cidx), on = [:x1=>:x1, :x2=>(:x2, :y)], droprangecols = false, makeunique = true, strict_inequality = true) == leftjoin(Dataset(view(dsl, l_ridx, l_cidx)), Dataset(view(dsr, r_ridx, r_cidx)), on = [:x1=>:x1, :x2=>(:x2, :y)], droprangecols = false, makeunique = true, strict_inequality = true) + + @test leftjoin(view(dsl, l_ridx, l_cidx), view(dsr, r_ridx, r_cidx), on = [:x1=>:x1, :x2=>(:x2, :y)], droprangecols = false, makeunique = true, method = :hash) == leftjoin(Dataset(view(dsl, l_ridx, l_cidx)), Dataset(view(dsr, r_ridx, r_cidx)), on = [:x1=>:x1, :x2=>(:x2, :y)], droprangecols = false, makeunique = true, stable = true) + @test leftjoin(view(dsl, l_ridx, l_cidx), view(dsr, r_ridx, r_cidx), on = [:x1=>:x1, :x2=>(:x2, nothing)], droprangecols = false, makeunique = true, method = :hash) == leftjoin(Dataset(view(dsl, l_ridx, l_cidx)), Dataset(view(dsr, r_ridx, r_cidx)), on = [:x1=>:x1, :x2=>(:x2, nothing)], droprangecols = false, makeunique = true, stable = true) + @test leftjoin(view(dsl, l_ridx, l_cidx), view(dsr, 
r_ridx, r_cidx), on = [:x1=>:x1, :x2=>(nothing, :y)], droprangecols = false, makeunique = true, method = :hash) == leftjoin(Dataset(view(dsl, l_ridx, l_cidx)), Dataset(view(dsr, r_ridx, r_cidx)), on = [:x1=>:x1, :x2=>(nothing, :y)], droprangecols = false, makeunique = true, stable = true) + @test leftjoin(view(dsl, l_ridx, l_cidx), view(dsr, r_ridx, r_cidx), on = [:x1=>:x1, :x2=>(:x2, :y)], droprangecols = false, makeunique = true, strict_inequality = true, method = :hash) == leftjoin(Dataset(view(dsl, l_ridx, l_cidx)), Dataset(view(dsr, r_ridx, r_cidx)), on = [:x1=>:x1, :x2=>(:x2, :y)], droprangecols = false, makeunique = true, strict_inequality = true, stable = true) + dsl = Dataset(rand(1:10, 10, 3), [:x1,:x2, :x3]) dsr = Dataset(rand(1:10, 4,3), [:x1, :x2, :y]) for i in 1:3 @@ -2330,6 +2626,26 @@ end @test innerjoin(view(dsl, l_ridx, l_cidx), view(dsr, r_ridx, r_cidx), on = [:x1=>:x1, :x2=>(nothing, :y)], droprangecols = false, makeunique = true, method = :hash) == innerjoin(Dataset(view(dsl, l_ridx, l_cidx)), Dataset(view(dsr, r_ridx, r_cidx)), on = [:x1=>:x1, :x2=>(nothing, :y)], droprangecols = false, makeunique = true) @test innerjoin(view(dsl, l_ridx, l_cidx), view(dsr, r_ridx, r_cidx), on = [:x1=>:x1, :x2=>(:x2, :y)], droprangecols = false, makeunique = true, strict_inequality = true, method = :hash) == innerjoin(Dataset(view(dsl, l_ridx, l_cidx)), Dataset(view(dsr, r_ridx, r_cidx)), on = [:x1=>:x1, :x2=>(:x2, :y)], droprangecols = false, makeunique = true, strict_inequality = true) + @test leftjoin(view(dsl, l_ridx, l_cidx), dsr, on = [:x1=>:x1, :x2=>(:x2, :y)], droprangecols = false, makeunique = true) == leftjoin(Dataset(view(dsl, l_ridx, l_cidx)), dsr, on = [:x1=>:x1, :x2=>(:x2, :y)], droprangecols = false, makeunique = true) + @test leftjoin(view(dsl, l_ridx, l_cidx), dsr, on = [:x1=>:x1, :x2=>(:x2, nothing)], droprangecols = false, makeunique = true) == leftjoin(Dataset(view(dsl, l_ridx, l_cidx)), dsr, on = [:x1=>:x1, :x2=>(:x2, nothing)], 
droprangecols = false, makeunique = true) + @test leftjoin(view(dsl, l_ridx, l_cidx), dsr, on = [:x1=>:x1, :x2=>(nothing, :y)], droprangecols = false, makeunique = true) == leftjoin(Dataset(view(dsl, l_ridx, l_cidx)), dsr, on = [:x1=>:x1, :x2=>(nothing, :y)], droprangecols = false, makeunique = true) + @test leftjoin(view(dsl, l_ridx, l_cidx), dsr, on = [:x1=>:x1, :x2=>(:x2, :y)], droprangecols = false, makeunique = true, strict_inequality = true) == leftjoin(Dataset(view(dsl, l_ridx, l_cidx)), dsr, on = [:x1=>:x1, :x2=>(:x2, :y)], droprangecols = false, makeunique = true, strict_inequality = true) + + @test leftjoin(view(dsl, l_ridx, l_cidx), dsr, on = [:x1=>:x1, :x2=>(:x2, :y)], droprangecols = false, makeunique = true, method = :hash) == leftjoin(Dataset(view(dsl, l_ridx, l_cidx)), dsr, on = [:x1=>:x1, :x2=>(:x2, :y)], droprangecols = false, makeunique = true) + @test leftjoin(view(dsl, l_ridx, l_cidx), dsr, on = [:x1=>:x1, :x2=>(:x2, nothing)], droprangecols = false, makeunique = true, method = :hash) == leftjoin(Dataset(view(dsl, l_ridx, l_cidx)), dsr, on = [:x1=>:x1, :x2=>(:x2, nothing)], droprangecols = false, makeunique = true) + @test leftjoin(view(dsl, l_ridx, l_cidx), dsr, on = [:x1=>:x1, :x2=>(nothing, :y)], droprangecols = false, makeunique = true, method = :hash) == leftjoin(Dataset(view(dsl, l_ridx, l_cidx)), dsr, on = [:x1=>:x1, :x2=>(nothing, :y)], droprangecols = false, makeunique = true) + @test leftjoin(view(dsl, l_ridx, l_cidx), dsr, on = [:x1=>:x1, :x2=>(:x2, :y)], droprangecols = false, makeunique = true, strict_inequality = true, method = :hash) == leftjoin(Dataset(view(dsl, l_ridx, l_cidx)), dsr, on = [:x1=>:x1, :x2=>(:x2, :y)], droprangecols = false, makeunique = true, strict_inequality = true) + + @test leftjoin(view(dsl, l_ridx, l_cidx), view(dsr, r_ridx, r_cidx), on = [:x1=>:x1, :x2=>(:x2, :y)], droprangecols = false, makeunique = true) == leftjoin(Dataset(view(dsl, l_ridx, l_cidx)), Dataset(view(dsr, r_ridx, r_cidx)), on = [:x1=>:x1, 
:x2=>(:x2, :y)], droprangecols = false, makeunique = true) + @test leftjoin(view(dsl, l_ridx, l_cidx), view(dsr, r_ridx, r_cidx), on = [:x1=>:x1, :x2=>(:x2, nothing)], droprangecols = false, makeunique = true) == leftjoin(Dataset(view(dsl, l_ridx, l_cidx)), Dataset(view(dsr, r_ridx, r_cidx)), on = [:x1=>:x1, :x2=>(:x2, nothing)], droprangecols = false, makeunique = true) + @test leftjoin(view(dsl, l_ridx, l_cidx), view(dsr, r_ridx, r_cidx), on = [:x1=>:x1, :x2=>(nothing, :y)], droprangecols = false, makeunique = true) == leftjoin(Dataset(view(dsl, l_ridx, l_cidx)), Dataset(view(dsr, r_ridx, r_cidx)), on = [:x1=>:x1, :x2=>(nothing, :y)], droprangecols = false, makeunique = true) + @test leftjoin(view(dsl, l_ridx, l_cidx), view(dsr, r_ridx, r_cidx), on = [:x1=>:x1, :x2=>(:x2, :y)], droprangecols = false, makeunique = true, strict_inequality = true) == leftjoin(Dataset(view(dsl, l_ridx, l_cidx)), Dataset(view(dsr, r_ridx, r_cidx)), on = [:x1=>:x1, :x2=>(:x2, :y)], droprangecols = false, makeunique = true, strict_inequality = true) + + @test leftjoin(view(dsl, l_ridx, l_cidx), view(dsr, r_ridx, r_cidx), on = [:x1=>:x1, :x2=>(:x2, :y)], droprangecols = false, makeunique = true, method = :hash) == leftjoin(Dataset(view(dsl, l_ridx, l_cidx)), Dataset(view(dsr, r_ridx, r_cidx)), on = [:x1=>:x1, :x2=>(:x2, :y)], droprangecols = false, makeunique = true) + @test leftjoin(view(dsl, l_ridx, l_cidx), view(dsr, r_ridx, r_cidx), on = [:x1=>:x1, :x2=>(:x2, nothing)], droprangecols = false, makeunique = true, method = :hash) == leftjoin(Dataset(view(dsl, l_ridx, l_cidx)), Dataset(view(dsr, r_ridx, r_cidx)), on = [:x1=>:x1, :x2=>(:x2, nothing)], droprangecols = false, makeunique = true) + @test leftjoin(view(dsl, l_ridx, l_cidx), view(dsr, r_ridx, r_cidx), on = [:x1=>:x1, :x2=>(nothing, :y)], droprangecols = false, makeunique = true, method = :hash) == leftjoin(Dataset(view(dsl, l_ridx, l_cidx)), Dataset(view(dsr, r_ridx, r_cidx)), on = [:x1=>:x1, :x2=>(nothing, :y)], droprangecols 
= false, makeunique = true) + @test leftjoin(view(dsl, l_ridx, l_cidx), view(dsr, r_ridx, r_cidx), on = [:x1=>:x1, :x2=>(:x2, :y)], droprangecols = false, makeunique = true, strict_inequality = true, method = :hash) == leftjoin(Dataset(view(dsl, l_ridx, l_cidx)), Dataset(view(dsr, r_ridx, r_cidx)), on = [:x1=>:x1, :x2=>(:x2, :y)], droprangecols = false, makeunique = true, strict_inequality = true) + dsl = Dataset(x1 = [1,2,1,3], y = [-1.2,-3,2.1,-3.5]) dsr = Dataset(x1 = [1,2,3], lower = [0, -3,1], upper = [1,0,2]) @test contains(dsl, dsr, on = [1=>1, 2=>(2,3)]) == [0,1,0,0] From f61ae752eb4c53bb9f04482c773399589630925a Mon Sep 17 00:00:00 2001 From: kyo227 <43748518+kyo227@users.noreply.github.com> Date: Tue, 13 Sep 2022 16:32:16 +0800 Subject: [PATCH 6/8] Change the _join_inner_dict to _join_left_dict. --- src/join/join.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/join/join.jl b/src/join/join.jl index 39ffcdbb..210e7aab 100644 --- a/src/join/join.jl +++ b/src/join/join.jl @@ -573,7 +573,7 @@ function _join_left(dsl, dsr, ::Val{T}; onleft, onright,onright_range, makeuniqu else ranges = Vector{UnitRange{T}}(undef, nrow(dsl)) if length(oncols_left) == 1 && type == :both && nrow(dsr)>1 - success, result = _join_inner_dict(dsl, dsr, ranges, oncols_left, oncols_right, right_cols, Val(T); makeunique = makeunique, mapformats = mapformats, check = check, threads = threads, multiple_match = multiple_match, multiple_match_name = multiple_match_name, obs_id = obs_id, obs_id_name = obs_id_name) + success, result = _join_left_dict(dsl, dsr, ranges, oncols_left, oncols_right, right_cols, Val(T); makeunique = makeunique, mapformats = mapformats, check = check, threads = threads, multiple_match = multiple_match, multiple_match_name = multiple_match_name, obs_id = obs_id, obs_id_name = obs_id_name) if success return result end @@ -684,7 +684,7 @@ function _join_left(dsl, dsr, ::Val{T}; onleft, onright,onright_range, makeuniqu obs_id_name1 = 
Symbol(obs_id_name, "_left") obs_id_left = allocatecol(nrow(dsl) < typemax(Int32) ? Int32 : Int64, total_length) _fill_oncols_left_table_left!(obs_id_left, 1:nrow(dsl), ranges, new_ends, total_length, missing; threads = threads) - insertcols!(dsnewdsl, ncol(newds)+1, obs_id_name1 => obs_id_left, unsupported_copy_cols = false, makeunique = makeunique) + insertcols!(newds, ncol(newds)+1, obs_id_name1 => obs_id_left, unsupported_copy_cols = false, makeunique = makeunique) end if obs_id[2] obs_id_name2 = Symbol(obs_id_name, "_right") @@ -758,7 +758,7 @@ function _join_left!(dsl::Dataset, dsr::AbstractDataset, ::Val{T}; onleft, onrig else ranges = Vector{UnitRange{T}}(undef, nrow(dsl)) if length(oncols_left) == 1 && type == :both && nrow(dsr)>1 - success, result = _join_inner_dict(dsl, dsr, ranges, oncols_left, oncols_right, right_cols, Val(T); makeunique = makeunique, mapformats = mapformats, check = check, threads = threads, multiple_match = multiple_match, multiple_match_name = multiple_match_name, obs_id = obs_id, obs_id_name = obs_id_name) + success, result = _join_left_dict(dsl, dsr, ranges, oncols_left, oncols_right, right_cols, Val(T); makeunique = makeunique, mapformats = mapformats, check = check, threads = threads, multiple_match = multiple_match, multiple_match_name = multiple_match_name, obs_id = obs_id, obs_id_name = obs_id_name) if success return result end From d2a6eb76177cb73e84766381b548afc5c73203d3 Mon Sep 17 00:00:00 2001 From: kyo227 <43748518+kyo227@users.noreply.github.com> Date: Tue, 20 Sep 2022 19:19:34 +0800 Subject: [PATCH 7/8] Implement range join function in leftjoin by new_ends and inbits. Pass the test of test/join.jl. leftjoin!() may be not necessary to implement the range join function, so i didn't change it. I write a function named _ranges_join(), which can implement the leftjoin and innerjoin. _ranges_join() use the parameter join_type to distinguished the leftjoin and innerjoin. 
--- src/join/join.jl | 525 ++++++++++++++++------------------------------- src/join/main.jl | 4 +- 2 files changed, 179 insertions(+), 350 deletions(-) diff --git a/src/join/join.jl b/src/join/join.jl index 210e7aab..216ffdea 100644 --- a/src/join/join.jl +++ b/src/join/join.jl @@ -147,6 +147,20 @@ function _fill_val_join!(x, r2, val, inbits, r) end end +function _fill_val_join!(x, r2, val, inbits, r, ranges) + cnt = 1 + lo = r2.start + for i in r + if inbits[i] + x[cnt+lo-1] = val + cnt += 1 + end + if !inbits[i] && length(ranges) === 0 + x[cnt+lo-1] = val + end + end +end + # F1 and F2 are here for type stability when threads = false function _find_ranges_for_join!(ranges, x, y, _fl::F1, _fr::F2, ::Val{T1}, ::Val{T2}; type = :both, threads = true) where {T1, T2, F1, F2} if type == :both @@ -226,11 +240,30 @@ function _find_ranges_for_join_pa!(ranges, x, invpool, y, _fl::F1, _fr::F2, ::Va end -function _fill_oncols_left_table_left!(_res, x, ranges, en, total, fill_val; threads = true) - @_threadsfor threads for i in 1:length(x) - i == 1 ? lo = 1 : lo = en[i - 1] + 1 - hi = en[i] - _fill_val_join!(_res, lo:hi, x[i]) + +function _fill_oncols_left_table_left!(_res, x, ranges, en, total, fill_val; inbits = nothing, en2 = nothing, threads = true) + if inbits === nothing + @_threadsfor threads for i in 1:length(x) + i == 1 ? 
lo = 1 : lo = en[i - 1] + 1 + hi = en[i] + _fill_val_join!(_res, lo:hi, x[i]) + end + else + @_threadsfor threads for i in 1:length(x) + if i == 1 + lo = 1 + lo2 = 1 + else + lo = en[i - 1] + 1 + lo2 = en2[i-1] + 1 + end + hi = en[i] + # @show sum(view(inbits, lo:hi)) + # sum(view(inbits, lo:hi)) == 0 && continue + hi2 = en2[i] + _fill_val_join!(_res, lo2:hi2, x[i], inbits, lo:hi, ranges[i]) + + end end @_threadsfor threads for i in en[length(x)]+1:total @@ -274,7 +307,7 @@ function _fill_oncols_left_table_anti!(_res, x, ranges, en, total; threads = tru end end -function _fill_right_cols_table_left!(_res, x, ranges, en, total, fill_val; ra = nothing, inbits = nothing, en2 = nothing, threads = true) +function _fill_right_cols_table_left!(_res, x, ranges, en, total, fill_val; inbits = nothing, en2 = nothing, threads = true) if inbits === nothing @_threadsfor threads for i in 1:length(ranges) i == 1 ? lo = 1 : lo = en[i - 1] + 1 @@ -283,26 +316,16 @@ function _fill_right_cols_table_left!(_res, x, ranges, en, total, fill_val; ra = end else @_threadsfor threads for i in 1:length(ranges) - i == 1 ? lo2 = 1 : lo2 = en2[i-1] + 1 - hi2 = en2[i] - left_fill_right_col_range!(_res, lo2:hi2, x, ranges[i], en[i], ra[i], inbits) - end - end -end - -function left_fill_right_col_range!(_res, r2, x, ranges, r, ra, inbits) - cnt = 1 - cnt_r = 1 - if length(r2) == 0 - _res = allowmissing(_res) - _res[r] = missing - else - for i in ra - if inbits[i] - _res[r-length(r2)+1+cnt-1] = x[ranges[cnt_r]] - cnt += 1 + if i == 1 + lo = 1 + lo2 = 1 + else + lo = en[i - 1] + 1 + lo2 = en2[i-1] + 1 end - cnt_r += 1 + hi = en[i] + hi2 = en2[i] + _fill_right_col_range!(_res, lo2:hi2, x, ranges[i], inbits, lo:hi) end end end @@ -326,7 +349,7 @@ function _fill_right_cols_table_inner!(_res, x, ranges, en, total; inbits = noth length(ranges[i]) == 0 && continue i == 1 ? 
lo = 1 : lo = en[i - 1] + 1 hi = en[i] - copyto!(_res, lo, x, ranges[i].start, length(ranges[i])) + copyto!(_res, lo, x, ranges[i].start, length(ranges[i])) end else @_threadsfor threads for i in 1:length(ranges) @@ -388,9 +411,9 @@ function _create_multiple_match_col_left(ranges, en, total_length) end end end - res end + function _create_multiple_match_col_inner(ranges, en, total_length) res = allocatecol(Bool, total_length) cnt = 0 @@ -431,7 +454,6 @@ function _create_multiple_match_col_inner(ranges, en, total_length) end end end - res end @@ -443,14 +465,21 @@ ISLE(::Missing, y) = false ISLE(::Missing, ::Missing) = false -function _mark_lt_part!(inbits, x_l, x_r, _fl::F1, _fr::F2, ranges, r_perms, en, ::Val{T}; strict = false, threads = true) where {T, F1, F2} +function _mark_lt_part!(inbits, x_l, x_r, _fl::F1, _fr::F2, ranges, r_perms, en, ::Val{T}; strict = false, threads = true, join_type) where {T, F1, F2} revised_ends = zeros(T, length(en)) @_threadsfor threads for i in 1:length(ranges) - if length(ranges[i]) == 0 - if i !== 1 - revised_ends[i] = 0 + if join_type === :left + if length(ranges[i]) == 0 + revised_ends[i] = 1 + continue + end + else + if length(ranges[i]) == 0 + if i !== 1 + revised_ends[i] = 0 + end + continue end - continue end i == 1 ? 
lo = 1 : lo = en[i - 1] + 1 hi = en[i] @@ -468,6 +497,9 @@ function _mark_lt_part!(inbits, x_l, x_r, _fl::F1, _fr::F2, ranges, r_perms, en, revised_ends[i] = total if total == 0 ranges[i] = 1:0 + if join_type === :left + revised_ends[i] += 1 + end end end our_cumsum!(revised_ends) @@ -509,10 +541,7 @@ function _change_refpool_find_range_for_join!(ranges, dsl, dsr, r_perms, oncols_ end end - - -function _join_left(dsl, dsr, ::Val{T}; onleft, onright,onright_range, makeunique = false, mapformats = [true, true], stable = false, alg = HeapSort, check = true, accelerate = false, droprangecols = true, strict_inequality = [false, false], method = :sort, threads = true, multiple_match = false, multiple_match_name = :multiple, obs_id = [false, false], obs_id_name = :obs_id) where T - isempty(dsl) && return copy(dsl) +function _ranges_join(dsl, dsr, ::Val{T}; onleft, onright,onright_range, makeunique = false, mapformats = [true, true], stable = false,onlyreturnrange = false, alg = HeapSort, check = true, accelerate = false, droprangecols = true, strict_inequality = [false, false], method = :sort, threads = true, multiple_match = false, multiple_match_name = :multiple, obs_id = [false, false], obs_id_name = :obs_id, join_type) where T oncols_left = onleft oncols_right = onright type = :both @@ -573,7 +602,11 @@ function _join_left(dsl, dsr, ::Val{T}; onleft, onright,onright_range, makeuniqu else ranges = Vector{UnitRange{T}}(undef, nrow(dsl)) if length(oncols_left) == 1 && type == :both && nrow(dsr)>1 - success, result = _join_left_dict(dsl, dsr, ranges, oncols_left, oncols_right, right_cols, Val(T); makeunique = makeunique, mapformats = mapformats, check = check, threads = threads, multiple_match = multiple_match, multiple_match_name = multiple_match_name, obs_id = obs_id, obs_id_name = obs_id_name) + if join_type === :left + success, result = _join_left_dict(dsl, dsr, ranges, oncols_left, oncols_right, right_cols, Val(T); makeunique = makeunique, mapformats = mapformats, 
check = check, threads = threads, multiple_match = multiple_match, multiple_match_name = multiple_match_name, obs_id = obs_id, obs_id_name = obs_id_name) + else + success, result = _join_inner_dict(dsl, dsr, ranges, oncols_left, oncols_right, right_cols, Val(T); makeunique = makeunique, mapformats = mapformats, check = check, threads = threads, multiple_match = multiple_match, multiple_match_name = multiple_match_name, obs_id = obs_id, obs_id_name = obs_id_name) + end if success return result end @@ -585,14 +618,17 @@ function _join_left(dsl, dsr, ::Val{T}; onleft, onright,onright_range, makeuniqu end _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], length(oncols_left); type = type, nsfpaj = nsfpaj, threads = threads) end - - new_ends = map(x -> max(1, length(x)), ranges) + + if join_type === :left + new_ends = map(x -> max(length(x),1), ranges) + else + new_ends = map(length,ranges) + end our_cumsum!(new_ends) total_length = new_ends[end] inbits = nothing revised_ends = nothing - ra = nothing if length(right_range_cols) == 2 inbits = zeros(Bool, total_length) # TODO any optimisation is needed for pa? @@ -604,26 +640,16 @@ function _join_left(dsl, dsr, ::Val{T}; onleft, onright,onright_range, makeuniqu if mapformats[2] _fr = getformat(dsr, right_range_cols[2]) end - revised_ends = _mark_lt_part!(inbits, _columns(dsl)[left_range_col], _columns(dsr)[right_range_cols[2]], _fl, _fr, ranges, idx, new_ends, total_length < typemax(Int32) ? 
Val(Int32) : Val(Int64); strict = strict_inequality[2], threads = threads) - inbit_ranges = Vector{UnitRange{T}}(undef, nrow(dsl)) - ra = Vector{UnitRange{T}}(undef, nrow(dsl)) - @_threadsfor threads for i in 1:length(ranges) - if i == 1 - lo = 1 - lo2 = 1 - else - lo = new_ends[i-1] + 1 - lo2 = revised_ends[i-1] + 1 - end - hi = new_ends[i] - hi2 = revised_ends[i] - ra[i] = lo:hi - inbit_ranges[i] = lo2:hi2 + revised_ends = _mark_lt_part!(inbits, _columns(dsl)[left_range_col], _columns(dsr)[right_range_cols[2]], _fl, _fr, ranges, idx, new_ends, total_length < typemax(Int32) ? Val(Int32) : Val(Int64); strict = strict_inequality[2], threads = threads, join_type) + if join_type === :left + total_length = revised_ends[end] + else + total_length = sum(inbits) end - - new_ends = map(x -> max(1, length(x)), inbit_ranges) - our_cumsum!(new_ends) - total_length = new_ends[end] + end + + if onlyreturnrange + return ranges end if check @@ -631,21 +657,32 @@ function _join_left(dsl, dsr, ::Val{T}; onleft, onright,onright_range, makeuniqu end if multiple_match - multiple_match_col = _create_multiple_match_col_left(ranges,revised_ends, total_length) + if join_type === :left + multiple_match_col = _create_multiple_match_col_left(ranges,revised_ends, total_length) + else + multiple_match_col = _create_multiple_match_col_inner(ranges, revised_ends, total_length) + end end res = [] for j in 1:length(index(dsl)) _res = allocatecol(_columns(dsl)[j], total_length, addmissing = false) if DataAPI.refpool(_res) !== nothing - # fill_val = DataAPI.invrefpool(_res)[missing] - _fill_oncols_left_table_left!(_res.refs, DataAPI.refarray(_columns(dsl)[j]), ranges, new_ends, total_length, missing; threads = threads) + if join_type === :left + _fill_oncols_left_table_left!(_res.refs, DataAPI.refarray(_columns(dsl)[j]), ranges, new_ends, total_length, missing; inbits = inbits, en2 = revised_ends, threads = threads) + else + _fill_oncols_left_table_inner!(_res.refs, 
DataAPI.refarray(_columns(dsl)[j]), ranges, new_ends, total_length; inbits = inbits, en2 = revised_ends, threads = threads) + end else - _fill_oncols_left_table_left!(_res, _columns(dsl)[j], ranges, new_ends, total_length, missing; threads = threads) + if join_type === :left + _fill_oncols_left_table_left!(_res, _columns(dsl)[j], ranges, new_ends, total_length, missing; inbits = inbits, en2 = revised_ends, threads = threads) + else + _fill_oncols_left_table_inner!(_res, _columns(dsl)[j], ranges, new_ends, total_length; inbits = inbits, en2 = revised_ends, threads = threads) + end end push!(res, _res) end - + if dsl isa SubDataset newds = Dataset(res, copy(index(dsl)), copycols = false) else @@ -653,22 +690,35 @@ function _join_left(dsl, dsr, ::Val{T}; onleft, onright,onright_range, makeuniqu end for j in 1:length(right_cols) - if dsr isa SubDataset - _res = allocatecol(_columns(copy(dsr))[right_cols[j]], total_length, addmissing = true) + if join_type === :left + if dsr isa SubDataset + _res = allocatecol(_columns(copy(dsr))[right_cols[j]], total_length, addmissing = true) + else + _res = allocatecol(_columns(dsr)[right_cols[j]], total_length, addmissing = true) + end else - _res = allocatecol(_columns(dsr)[right_cols[j]], total_length, addmissing = true) + _res = allocatecol(_columns(dsr)[right_cols[j]], total_length, addmissing = false) end + if DataAPI.refpool(_res) !== nothing - fill_val = DataAPI.invrefpool(_res)[missing] - _fill_right_cols_table_left!(_res.refs, view(DataAPI.refarray(_columns(dsr)[right_cols[j]]), idx), ranges, new_ends, total_length, fill_val; ra = ra, inbits = inbits, en2 = revised_ends, threads = threads) - # Solve the #undef problem. 
There has to be a better way - for i in 1:length(_res) - if !isassigned(_res,i) - _res[i] = missing + if join_type === :left + fill_val = DataAPI.invrefpool(_res)[missing] + _fill_right_cols_table_left!(_res.refs, view(DataAPI.refarray(_columns(dsr)[right_cols[j]]), idx), ranges, new_ends, total_length, fill_val; inbits = inbits, en2 = revised_ends, threads = threads) + # Solve the #undef problem. There has to be a better way + for i in 1:length(_res) + if !isassigned(_res,i) + _res[i] = missing + end end + else + _fill_right_cols_table_inner!(_res.refs, view(DataAPI.refarray(_columns(dsr)[right_cols[j]]), idx), ranges, new_ends, total_length; inbits = inbits, en2 = revised_ends, threads = threads) end else - _fill_right_cols_table_left!(_res, view(DataAPI.refarray(_columns(dsr)[right_cols[j]]), idx), ranges, new_ends, total_length, missing; ra = ra, inbits = inbits, en2 = revised_ends, threads = threads) + if join_type === :left + _fill_right_cols_table_left!(_res, view(DataAPI.refarray(_columns(dsr)[right_cols[j]]), idx), ranges, new_ends, total_length, missing; inbits = inbits, en2 = revised_ends, threads = threads) + else + _fill_right_cols_table_inner!(_res, view(DataAPI.refarray(_columns(dsr)[right_cols[j]]), idx), ranges, new_ends, total_length; inbits = inbits, en2 = revised_ends, threads = threads) + end end push!(_columns(newds), _res) new_var_name = make_unique([_names(dsl); _names(dsr)[right_cols[j]]], makeunique = makeunique)[end] @@ -683,162 +733,93 @@ function _join_left(dsl, dsr, ::Val{T}; onleft, onright,onright_range, makeuniqu if obs_id[1] obs_id_name1 = Symbol(obs_id_name, "_left") obs_id_left = allocatecol(nrow(dsl) < typemax(Int32) ? 
Int32 : Int64, total_length) - _fill_oncols_left_table_left!(obs_id_left, 1:nrow(dsl), ranges, new_ends, total_length, missing; threads = threads) + if join_type === :left + _fill_oncols_left_table_left!(obs_id_left, 1:nrow(dsl), ranges, new_ends, total_length, missing; inbits = inbits, en2 = revised_ends, threads = threads) + else + _fill_oncols_left_table_inner!(obs_id_left, 1:nrow(dsl), ranges, new_ends, total_length; inbits = inbits, en2 = revised_ends, threads = threads) + end insertcols!(newds, ncol(newds)+1, obs_id_name1 => obs_id_left, unsupported_copy_cols = false, makeunique = makeunique) end if obs_id[2] obs_id_name2 = Symbol(obs_id_name, "_right") obs_id_right = allocatecol(T, total_length) - _fill_right_cols_table_left!(obs_id_right, idx, ranges, new_ends, total_length, missing; ra = ra, inbits = inbits, en2 = revised_ends, threads = threads) + if join_type === :left + _fill_right_cols_table_left!(obs_id_right, idx, ranges, new_ends, total_length, missing; inbits = inbits, en2 = revised_ends, threads = threads) + else + _fill_right_cols_table_inner!(obs_id_right, idx, ranges, new_ends, total_length; inbits = inbits, en2 = revised_ends, threads = threads) + end insertcols!(newds, ncol(newds)+1, obs_id_name2 => obs_id_right, unsupported_copy_cols = false, makeunique = makeunique) end newds - -end -function _join_left!(dsl::Dataset, dsr::AbstractDataset, ::Val{T}; onleft, onright, onright_range, makeunique = false,mapformats = [true, true], stable = false, alg = HeapSort, check = true, accelerate = false, droprangecols = true, strict_inequality = [false, false], method = :sort, threads = true, multiple_match = false, multiple_match_name = :multiple, obs_id = [false, false], obs_id_name = :obs_id) where T - isempty(dsl) && return dsl - oncols_left = onleft - oncols_right = onright - type = :both - right_range_cols = Int[] +end - if onright_range !== nothing - left_range_col = oncols_left[end] - right_range_cols = index(dsr)[filter!(!isequal(nothing), 
collect(onright_range))] - if droprangecols - right_cols = setdiff(1:length(index(dsr)), [oncols_right; right_range_cols]) - else - right_cols = setdiff(1:length(index(dsr)), oncols_right) - end - - oncols_right = [oncols_right; first(right_range_cols)] - if onright_range[1] !== nothing - if strict_inequality[1] - type = :leftstrict - else - type = :left - end - else - if strict_inequality[2] - type = :rightstrict - else - type = :right - end - end - else - right_cols = setdiff(1:length(index(dsr)), oncols_right) - end - if !makeunique && !isempty(intersect(_names(dsl), _names(dsr)[right_cols])) - throw(ArgumentError("duplicate column names, pass `makeunique = true` to make them unique using a suffix automatically." )) - end - nsfpaj = true - # if the columns for inequality like join are PA we cannot use the fast path - if type != :both - if any(i-> DataAPI.refpool(_columns(dsr)[i]) !== nothing, right_range_cols) - nsfpaj = false - end - end +function _join_left(dsl, dsr, ::Val{T}; onleft, onright,onright_range, makeunique = false, mapformats = [true, true], stable = false, alg = HeapSort, check = true, accelerate = false, droprangecols = true, strict_inequality = [false, false], onlyreturnrange = false, method = :sort, threads = true, multiple_match = false, multiple_match_name = :multiple, obs_id = [false, false], obs_id_name = :obs_id) where T + isempty(dsl) && return copy(dsl) + _ranges_join(dsl, dsr, nrow(dsr) < typemax(Int32) ? 
Val(Int32) : Val(Int64), onleft = onleft, onright = onright, onright_range = onright_range, stable = stable, onlyreturnrange = onlyreturnrange, strict_inequality = strict_inequality, makeunique = makeunique, mapformats = mapformats,accelerate = accelerate, check = check, droprangecols = droprangecols, method = method, threads = threads, multiple_match = multiple_match, multiple_match_name = multiple_match_name, obs_id = obs_id, obs_id_name = obs_id_name, join_type=:left) +end - if method == :hash && (onright_range === nothing || length(onleft) > 1) - if onright_range !== nothing - ranges, a, idx, minval, reps, sz, right_cols_2 = _find_ranges_for_join_using_hash(dsl, dsr, onleft[1:end-1], oncols_right[1:end-1], mapformats, true, Val(T); threads = threads) - filter!(!=(0), reps) - pushfirst!(reps, 1) - our_cumsum!(reps) - pop!(reps) - grng = GIVENRANGE(idx, reps, Int[], length(reps)) - starts, idx, last_valid_range = _sort_for_join_after_hash(dsr, right_range_cols[1], stable, alg, mapformats, nsfpaj, grng; threads = threads) - _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], length(oncols_left); type = type, nsfpaj = nsfpaj, threads = threads) - else - ranges, a, idx, minval, reps, sz, right_cols = _find_ranges_for_join_using_hash(dsl, dsr, onleft, onright, mapformats, makeunique, Val(T); threads = threads) +function _join_left!(dsl::Dataset, dsr::AbstractDataset, ::Val{T}; onleft, onright, makeunique = false, mapformats = [true, true], stable = false, alg = HeapSort, check = true, accelerate = false, method = :sort, threads = true, multiple_match = false, multiple_match_name = :multiple, obs_id = [false, false], obs_id_name = :obs_id) where T + isempty(dsl) && return dsl + if method == :hash + ranges, a, idx, minval, reps, sz, right_cols = _find_ranges_for_join_using_hash(dsl, dsr, onleft, onright, mapformats, makeunique, Val(T); threads = threads) + elseif method == :sort + oncols_left = onleft + 
oncols_right = onright + # Inverse selection of the right table + right_cols = setdiff(1:length(index(dsr)), oncols_right) + # check unique + if !makeunique && !isempty(intersect(_names(dsl), _names(dsr)[right_cols])) + throw(ArgumentError("duplicate column names, pass `makeunique = true` to make them unique using a suffix automatically." )) end - else + # init ranges ranges = Vector{UnitRange{T}}(undef, nrow(dsl)) - if length(oncols_left) == 1 && type == :both && nrow(dsr)>1 - success, result = _join_left_dict(dsl, dsr, ranges, oncols_left, oncols_right, right_cols, Val(T); makeunique = makeunique, mapformats = mapformats, check = check, threads = threads, multiple_match = multiple_match, multiple_match_name = multiple_match_name, obs_id = obs_id, obs_id_name = obs_id_name) + # if oncols_left has only one column. + if length(oncols_left) == 1 && nrow(dsr)>1 + # if the key in dsr is unique, success is true. + # for row in left, must be one to one. + success, result = _join_left!_dict(dsl, dsr, ranges, oncols_left, oncols_right, right_cols, Val(T); makeunique = makeunique, mapformats = mapformats, check = check, threads = threads, multiple_match = multiple_match, multiple_match_name = multiple_match_name, obs_id = obs_id, obs_id_name = obs_id_name) if success return result end end - idx, uniquemode = _find_permute_and_fill_range_for_join!(ranges, dsr, dsl, oncols_right, oncols_left, stable, alg, mapformats, accelerate && (onright_range == nothing || length(oncols_right)>1); nsfpaj = nsfpaj, threads = threads) - - for j in 1:length(oncols_left)-1 - _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], j; nsfpaj = nsfpaj, threads = threads) + + # idx is the order of dsr by hashed value. 
+ # uniquemode is true, means that no repeat + idx, uniquemode = _find_permute_and_fill_range_for_join!(ranges, dsr, dsl, oncols_right, oncols_left, stable, alg, mapformats, accelerate, threads = threads) + + # This function will find ranges. if ranges has elements like 2:1, the first number is larger than second + # which means that this row in dsl has no matched row in dsr. + for j in 1:length(oncols_left) + _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], j, threads = threads) end - _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], length(oncols_left); type = type, nsfpaj = nsfpaj, threads = threads) + end + + if !all(x->length(x) <= 1, ranges) + throw(ArgumentError("`leftjoin!` can only be used when each observation in left data set matches at most one observation from right data set")) end new_ends = map(x -> max(1, length(x)), ranges) our_cumsum!(new_ends) total_length = new_ends[end] - inbits = nothing - revised_ends = nothing - ra = nothing - if length(right_range_cols) == 2 - inbits = zeros(Bool, total_length) - # TODO any optimisation is needed for pa? - _fl = identity - _fr = identity - if mapformats[1] - _fl = getformat(dsl, left_range_col) - end - if mapformats[2] - _fr = getformat(dsr, right_range_cols[2]) - end - revised_ends = _mark_lt_part!(inbits, _columns(dsl)[left_range_col], _columns(dsr)[right_range_cols[2]], _fl, _fr, ranges, idx, new_ends, total_length < typemax(Int32) ? 
Val(Int32) : Val(Int64); strict = strict_inequality[2], threads = threads) - inbit_ranges = Vector{UnitRange{T}}(undef, nrow(dsl)) - ra = Vector{UnitRange{T}}(undef, nrow(dsl)) - @_threadsfor threads for i in 1:length(ranges) - if i == 1 - lo = 1 - lo2 = 1 - else - lo = new_ends[i-1] + 1 - lo2 = revised_ends[i-1] + 1 - end - hi = new_ends[i] - hi2 = revised_ends[i] - ra[i] = lo:hi - inbit_ranges[i] = lo2:hi2 - end - - new_ends = map(x -> max(1, length(x)), inbit_ranges) - our_cumsum!(new_ends) - total_length = new_ends[end] - end - - if !all(x->length(x) <= 1, inbit_ranges) - throw(ArgumentError("`leftjoin!` can only be used when each observation in left data set matches at most one observation from right data set")) - end - if check @assert total_length < 10*nrow(dsl) "the output data set will be very large ($(total_length)×$(ncol(dsl)+length(right_cols))) compared to the left data set size ($(nrow(dsl))×$(ncol(dsl))), make sure that the `on` keyword is selected properly, alternatively, pass `check = false` to ignore this error." 
end if multiple_match - multiple_match_col = _create_multiple_match_col_left(ranges,revised_ends, total_length) + multiple_match_col = _create_multiple_match_col_left(ranges, total_length) end for j in 1:length(right_cols) - if dsr isa SubDataset - _res = allocatecol(_columns(copy(dsr))[right_cols[j]], total_length, addmissing = true) - else - _res = allocatecol(_columns(dsr)[right_cols[j]], total_length, addmissing = true) - end + _res = allocatecol(_columns(dsr)[right_cols[j]], total_length) if DataAPI.refpool(_res) !== nothing fill_val = DataAPI.invrefpool(_res)[missing] - _fill_right_cols_table_left!(_res.refs, view(DataAPI.refarray(_columns(dsr)[right_cols[j]]), idx), ranges, new_ends, total_length, fill_val;ra=ra, inbits = inbits, en2 = revised_ends, threads = threads) - for x in 1:length(_res) - if !isassigned(_res,x) - _res[x] = missing - end - end + _fill_right_cols_table_left!(_res.refs, view(DataAPI.refarray(_columns(dsr)[right_cols[j]]), idx), ranges, new_ends, total_length, fill_val, threads = threads) else - _fill_right_cols_table_left!(_res, view(_columns(dsr)[right_cols[j]], idx), ranges, new_ends, total_length, missing;ra=ra, inbits = inbits, en2 = revised_ends, threads = threads) + _fill_right_cols_table_left!(_res, view(_columns(dsr)[right_cols[j]], idx), ranges, new_ends, total_length, missing, threads = threads) end push!(_columns(dsl), _res) new_var_name = make_unique([_names(dsl); _names(dsr)[right_cols[j]]], makeunique = makeunique)[end] @@ -857,7 +838,7 @@ function _join_left!(dsl::Dataset, dsr::AbstractDataset, ::Val{T}; onleft, onrig if obs_id[2] obs_id_name2 = Symbol(obs_id_name, "_right") obs_id_right = allocatecol(T, total_length) - _fill_right_cols_table_left!(obs_id_right, idx, ranges, new_ends, total_length, missing;ra=ra, inbits = inbits, en2 = revised_ends, threads = threads) + _fill_right_cols_table_left!(obs_id_right, idx, ranges, new_ends, total_length, missing, threads = threads) insertcols!(dsl, ncol(dsl)+1, obs_id_name2 => 
obs_id_right, unsupported_copy_cols = false, makeunique = makeunique) end @@ -867,159 +848,7 @@ end function _join_inner(dsl, dsr::AbstractDataset, ::Val{T}; onleft, onright, onright_range = nothing , makeunique = false, mapformats = [true, true], stable = false, alg = HeapSort, check = true, accelerate = false, droprangecols = true, strict_inequality = [false, false], method = :sort, threads = true, onlyreturnrange = false, multiple_match = false, multiple_match_name = :multiple, obs_id = [false, false], obs_id_name = :obs_id) where T (isempty(dsl) || isempty(dsr)) && throw(ArgumentError("in `innerjoin` both left and right tables must be non-empty")) - oncols_left = onleft - oncols_right = onright - type = :both - right_range_cols = Int[] - if onright_range !== nothing - left_range_col = oncols_left[end] - - right_range_cols = index(dsr)[filter!(!isequal(nothing), collect(onright_range))] - if droprangecols - right_cols = setdiff(1:length(index(dsr)), [oncols_right; right_range_cols]) - else - right_cols = setdiff(1:length(index(dsr)), oncols_right) - end - - oncols_right = [oncols_right; first(right_range_cols)] - if onright_range[1] !== nothing - if strict_inequality[1] - type = :leftstrict - else - type = :left - end - else - if strict_inequality[2] - type = :rightstrict - else - type = :right - end - end - else - right_cols = setdiff(1:length(index(dsr)), oncols_right) - end - if !makeunique && !isempty(intersect(_names(dsl), _names(dsr)[right_cols])) - throw(ArgumentError("duplicate column names, pass `makeunique = true` to make them unique using a suffix automatically." 
)) - end - - nsfpaj = true - # if the columns for inequality like join are PA we cannot use the fast path - if type != :both - if any(i-> DataAPI.refpool(_columns(dsr)[i]) !== nothing, right_range_cols) - nsfpaj = false - end - end - # if (onright_range === nothing || length(onleft) > 1) is false, then we have inequality kind join with no exact match join - if method == :hash && (onright_range === nothing || length(onleft) > 1) - if onright_range !== nothing - ranges, a, idx, minval, reps, sz, right_cols_2 = _find_ranges_for_join_using_hash(dsl, dsr, onleft[1:end-1], oncols_right[1:end-1], mapformats, true, Val(T); threads = threads) - filter!(!=(0), reps) - pushfirst!(reps, 1) - our_cumsum!(reps) - pop!(reps) - grng = GIVENRANGE(idx, reps, Int[], length(reps)) - starts, idx, last_valid_range = _sort_for_join_after_hash(dsr, right_range_cols[1], stable, alg, mapformats, nsfpaj, grng; threads = threads) - _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], length(oncols_left); type = type, nsfpaj = nsfpaj, threads = threads) - else - ranges, a, idx, minval, reps, sz, right_cols = _find_ranges_for_join_using_hash(dsl, dsr, onleft, onright, mapformats, makeunique, Val(T); threads = threads) - end - else - ranges = Vector{UnitRange{T}}(undef, nrow(dsl)) - if length(oncols_left) == 1 && type == :both && nrow(dsr)>1 - success, result = _join_inner_dict(dsl, dsr, ranges, oncols_left, oncols_right, right_cols, Val(T); makeunique = makeunique, mapformats = mapformats, check = check, threads = threads, multiple_match = multiple_match, multiple_match_name = multiple_match_name, obs_id = obs_id, obs_id_name = obs_id_name) - if success - return result - end - end - idx, uniquemode = _find_permute_and_fill_range_for_join!(ranges, dsr, dsl, oncols_right, oncols_left, stable, alg, mapformats, accelerate && (onright_range == nothing || length(oncols_right)>1); nsfpaj = nsfpaj, threads = threads) - - for j in 
1:length(oncols_left)-1 - _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], j; nsfpaj = nsfpaj, threads = threads) - end - _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], length(oncols_left); type = type, nsfpaj = nsfpaj, threads = threads) - end - - - - new_ends = map(length, ranges) - our_cumsum!(new_ends) - total_length = new_ends[end] - - inbits = nothing - revised_ends = nothing - if length(right_range_cols) == 2 - inbits = zeros(Bool, total_length) - # TODO any optimisation is needed for pa? - _fl = identity - _fr = identity - if mapformats[1] - _fl = getformat(dsl, left_range_col) - end - if mapformats[2] - _fr = getformat(dsr, right_range_cols[2]) - end - revised_ends = _mark_lt_part!(inbits, _columns(dsl)[left_range_col], _columns(dsr)[right_range_cols[2]], _fl, _fr, ranges, idx, new_ends, total_length < typemax(Int32) ? Val(Int32) : Val(Int64); strict = strict_inequality[2], threads = threads) - end - if length(right_range_cols) == 2 - total_length = sum(inbits) - end - - if onlyreturnrange - return ranges - end - if check - @assert total_length < 10*nrow(dsl) "the output data set will be very large ($(total_length)×$(ncol(dsl)+length(right_cols))) compared to the left data set size ($(nrow(dsl))×$(ncol(dsl))), make sure that the `on` keyword is selected properly, alternatively, pass `check = false` to ignore this error." 
- end - if multiple_match - multiple_match_col = _create_multiple_match_col_inner(ranges, revised_ends, total_length) - end - - res = [] - for j in 1:length(index(dsl)) - _res = allocatecol(_columns(dsl)[j], total_length, addmissing = false) - if DataAPI.refpool(_res) !== nothing - _fill_oncols_left_table_inner!(_res.refs, DataAPI.refarray(_columns(dsl)[j]), ranges, new_ends, total_length; inbits = inbits, en2 = revised_ends, threads = threads) - else - _fill_oncols_left_table_inner!(_res, _columns(dsl)[j], ranges, new_ends, total_length; inbits = inbits, en2 = revised_ends, threads = threads) - end - push!(res, _res) - end - if dsl isa SubDataset - newds = Dataset(res, copy(index(dsl)), copycols = false) - else - newds = Dataset(res, Index(copy(index(dsl).lookup), copy(index(dsl).names), copy(index(dsl).format)), copycols = false) - end - - for j in 1:length(right_cols) - _res = allocatecol(_columns(dsr)[right_cols[j]], total_length, addmissing = false) - if DataAPI.refpool(_res) !== nothing - _fill_right_cols_table_inner!(_res.refs, view(DataAPI.refarray(_columns(dsr)[right_cols[j]]), idx), ranges, new_ends, total_length; inbits = inbits, en2 = revised_ends, threads = threads) - else - _fill_right_cols_table_inner!(_res, view(_columns(dsr)[right_cols[j]], idx), ranges, new_ends, total_length; inbits = inbits, en2 = revised_ends, threads = threads) - end - push!(_columns(newds), _res) - - new_var_name = make_unique([_names(dsl); _names(dsr)[right_cols[j]]], makeunique = makeunique)[end] - push!(index(newds), new_var_name) - setformat!(newds, index(newds)[new_var_name], getformat(dsr, _names(dsr)[right_cols[j]])) - end - if multiple_match - insertcols!(newds, ncol(newds)+1, multiple_match_name => multiple_match_col, unsupported_copy_cols = false, makeunique = makeunique) - end - if obs_id[1] - obs_id_name1 = Symbol(obs_id_name, "_left") - obs_id_left = allocatecol(nrow(dsl) < typemax(Int32) ? 
Int32 : Int64, total_length) - _fill_oncols_left_table_inner!(obs_id_left, 1:nrow(dsl), ranges, new_ends, total_length; inbits = inbits, en2 = revised_ends, threads = threads) - insertcols!(newds, ncol(newds)+1, obs_id_name1 => obs_id_left, unsupported_copy_cols = false, makeunique = makeunique) - end - if obs_id[2] - obs_id_name2 = Symbol(obs_id_name, "_right") - obs_id_right = allocatecol(T, total_length) - _fill_right_cols_table_inner!(obs_id_right, idx, ranges, new_ends, total_length; inbits = inbits, en2 = revised_ends, threads = threads) - insertcols!(newds, ncol(newds)+1, obs_id_name2 => obs_id_right, unsupported_copy_cols = false, makeunique = makeunique) - end - newds - + _ranges_join(dsl, dsr, nrow(dsr) < typemax(Int32) ? Val(Int32) : Val(Int64), onleft = onleft, onright = onright, onright_range = onright_range, stable = stable, onlyreturnrange = onlyreturnrange, strict_inequality = strict_inequality, makeunique = makeunique, mapformats = mapformats,accelerate = accelerate, check = check, droprangecols = droprangecols, method = method, threads = threads, multiple_match = multiple_match, multiple_match_name = multiple_match_name, obs_id = obs_id, obs_id_name = obs_id_name, join_type=:inner) end function _in(dsl::AbstractDataset, dsr::AbstractDataset, ::Val{T}; onleft, onright, mapformats = [true, true], stable = false, alg = HeapSort, accelerate = false, threads = true) where T diff --git a/src/join/main.jl b/src/join/main.jl index d3a77136..4c50a689 100644 --- a/src/join/main.jl +++ b/src/join/main.jl @@ -131,7 +131,7 @@ julia> leftjoin(dsl, dsr, on = :year, mapformats = true) # Use formats for datas 4 │ 2012 true missing ``` """ -function DataAPI.leftjoin(dsl::AbstractDataset, dsr::AbstractDataset; on = nothing, makeunique = false, mapformats::Union{Bool, Vector{Bool}} = true, stable = false, alg = HeapSort, check = true, accelerate = false, droprangecols::Bool = true, strict_inequality = false, method::Symbol = :sort, threads::Bool = true, 
multiple_match::Bool = false, multiple_match_name = :multiple, obs_id::Union{Bool, Vector{Bool}} = false, obs_id_name = :obs_id) +function DataAPI.leftjoin(dsl::AbstractDataset, dsr::AbstractDataset; on = nothing, makeunique = false, mapformats::Union{Bool, Vector{Bool}} = true, stable = false, alg = HeapSort, check = true, accelerate = false, onlyreturnrange = false, droprangecols::Bool = true, strict_inequality = false, method::Symbol = :sort, threads::Bool = true, multiple_match::Bool = false, multiple_match_name = :multiple, obs_id::Union{Bool, Vector{Bool}} = false, obs_id_name = :obs_id) !(method in (:hash, :sort)) && throw(ArgumentError("method must be :hash or :sort")) on === nothing && throw(ArgumentError("`on` keyword must be specified")) if !(on isa AbstractVector) @@ -174,7 +174,7 @@ function DataAPI.leftjoin(dsl::AbstractDataset, dsr::AbstractDataset; on = nothi else throw(ArgumentError("`on` keyword must be a vector of column names or a vector of pairs of column names")) end - _join_left(dsl, dsr, nrow(dsr) < typemax(Int32) ? Val(Int32) : Val(Int64), onleft = onleft, onright = onright, onright_range = onright_range, stable = stable, strict_inequality = strict_inequality, makeunique = makeunique, mapformats = mapformats,accelerate = accelerate, check = false,droprangecols = droprangecols, method = method, threads = threads, multiple_match = multiple_match, multiple_match_name = multiple_match_name, obs_id = obs_id, obs_id_name = obs_id_name) + _join_left(dsl, dsr, nrow(dsr) < typemax(Int32) ? 
Val(Int32) : Val(Int64), onleft = onleft, onright = onright, onright_range = onright_range, stable = stable, onlyreturnrange = onlyreturnrange, strict_inequality = strict_inequality, makeunique = makeunique, mapformats = mapformats,accelerate = accelerate, check = false,droprangecols = droprangecols, method = method, threads = threads, multiple_match = multiple_match, multiple_match_name = multiple_match_name, obs_id = obs_id, obs_id_name = obs_id_name) end """ From d12033b9f77b87d46cb07b7b8730ca178973aab7 Mon Sep 17 00:00:00 2001 From: kyo227 <43748518+kyo227@users.noreply.github.com> Date: Mon, 17 Oct 2022 14:25:28 +0800 Subject: [PATCH 8/8] Fix some problems --- src/join/join.jl | 110 ++++++++++++++++++++++++----------------------- 1 file changed, 57 insertions(+), 53 deletions(-) diff --git a/src/join/join.jl b/src/join/join.jl index 216ffdea..6d052d23 100644 --- a/src/join/join.jl +++ b/src/join/join.jl @@ -147,20 +147,6 @@ function _fill_val_join!(x, r2, val, inbits, r) end end -function _fill_val_join!(x, r2, val, inbits, r, ranges) - cnt = 1 - lo = r2.start - for i in r - if inbits[i] - x[cnt+lo-1] = val - cnt += 1 - end - if !inbits[i] && length(ranges) === 0 - x[cnt+lo-1] = val - end - end -end - # F1 and F2 are here for type stability when threads = false function _find_ranges_for_join!(ranges, x, y, _fl::F1, _fr::F2, ::Val{T1}, ::Val{T2}; type = :both, threads = true) where {T1, T2, F1, F2} if type == :both @@ -261,7 +247,7 @@ function _fill_oncols_left_table_left!(_res, x, ranges, en, total, fill_val; inb # @show sum(view(inbits, lo:hi)) # sum(view(inbits, lo:hi)) == 0 && continue hi2 = en2[i] - _fill_val_join!(_res, lo2:hi2, x[i], inbits, lo:hi, ranges[i]) + length(ranges[i]) ===0 ? 
_fill_val_join!(_res, lo2:hi2, x[i]) : _fill_val_join!(_res, lo2:hi2, x[i], inbits, lo:hi) end end @@ -325,7 +311,7 @@ function _fill_right_cols_table_left!(_res, x, ranges, en, total, fill_val; inbi end hi = en[i] hi2 = en2[i] - _fill_right_col_range!(_res, lo2:hi2, x, ranges[i], inbits, lo:hi) + _fill_right_col_range!(_res, lo2:hi2, x, ranges[i], inbits, lo:hi, fill_val) end end end @@ -343,6 +329,23 @@ function _fill_right_col_range!(_res, r2, x, ranges, inbits, r) end end +function _fill_right_col_range!(_res, r2, x, ranges, inbits, r, fill_val) + cnt = 1 + cnt_r = 1 + lo = r2.start + for i in r + if inbits[i] + _res[lo+cnt-1] = x[ranges[cnt_r]] + cnt += 1 + end + if !inbits[i] && length(ranges) === 0 + _res[cnt+lo-1] = fill_val + end + cnt_r += 1 + end +end + + function _fill_right_cols_table_inner!(_res, x, ranges, en, total; inbits = nothing, en2 = nothing, threads = true) if inbits === nothing @_threadsfor threads for i in 1:length(ranges) @@ -467,38 +470,55 @@ ISLE(::Missing, ::Missing) = false function _mark_lt_part!(inbits, x_l, x_r, _fl::F1, _fr::F2, ranges, r_perms, en, ::Val{T}; strict = false, threads = true, join_type) where {T, F1, F2} revised_ends = zeros(T, length(en)) - @_threadsfor threads for i in 1:length(ranges) - if join_type === :left + if join_type === :left + @_threadsfor threads for i in 1:length(ranges) if length(ranges[i]) == 0 revised_ends[i] = 1 continue end - else + i == 1 ? lo = 1 : lo = en[i - 1] + 1 + hi = en[i] + total = 0 + cnt = 1 + for j in ranges[i] + if strict + inbits[lo + cnt - 1] = isless(_fl(x_l[i]), _fr(x_r[r_perms[j]])) + else + inbits[lo + cnt - 1] = ISLE(_fl(x_l[i]), _fr(x_r[r_perms[j]])) + end + total += inbits[lo + cnt - 1] + cnt += 1 + end + revised_ends[i] = total + if total == 0 + ranges[i] = 1:0 + revised_ends[i] += 1 + end + end + else + @_threadsfor threads for i in 1:length(ranges) if length(ranges[i]) == 0 if i !== 1 revised_ends[i] = 0 end continue end - end - i == 1 ? 
lo = 1 : lo = en[i - 1] + 1 - hi = en[i] - total = 0 - cnt = 1 - for j in ranges[i] - if strict - inbits[lo + cnt - 1] = isless(_fl(x_l[i]), _fr(x_r[r_perms[j]])) - else - inbits[lo + cnt - 1] = ISLE(_fl(x_l[i]), _fr(x_r[r_perms[j]])) + i == 1 ? lo = 1 : lo = en[i - 1] + 1 + hi = en[i] + total = 0 + cnt = 1 + for j in ranges[i] + if strict + inbits[lo + cnt - 1] = isless(_fl(x_l[i]), _fr(x_r[r_perms[j]])) + else + inbits[lo + cnt - 1] = ISLE(_fl(x_l[i]), _fr(x_r[r_perms[j]])) + end + total += inbits[lo + cnt - 1] + cnt += 1 end - total += inbits[lo + cnt - 1] - cnt += 1 - end - revised_ends[i] = total - if total == 0 - ranges[i] = 1:0 - if join_type === :left - revised_ends[i] += 1 + revised_ends[i] = total + if total == 0 + ranges[i] = 1:0 end end end @@ -704,12 +724,6 @@ function _ranges_join(dsl, dsr, ::Val{T}; onleft, onright,onright_range, makeuni if join_type === :left fill_val = DataAPI.invrefpool(_res)[missing] _fill_right_cols_table_left!(_res.refs, view(DataAPI.refarray(_columns(dsr)[right_cols[j]]), idx), ranges, new_ends, total_length, fill_val; inbits = inbits, en2 = revised_ends, threads = threads) - # Solve the #undef problem. There has to be a better way - for i in 1:length(_res) - if !isassigned(_res,i) - _res[i] = missing - end - end else _fill_right_cols_table_inner!(_res.refs, view(DataAPI.refarray(_columns(dsr)[right_cols[j]]), idx), ranges, new_ends, total_length; inbits = inbits, en2 = revised_ends, threads = threads) end @@ -768,30 +782,20 @@ function _join_left!(dsl::Dataset, dsr::AbstractDataset, ::Val{T}; onleft, onrig elseif method == :sort oncols_left = onleft oncols_right = onright - # Inverse selection of the right table right_cols = setdiff(1:length(index(dsr)), oncols_right) - # check unique if !makeunique && !isempty(intersect(_names(dsl), _names(dsr)[right_cols])) throw(ArgumentError("duplicate column names, pass `makeunique = true` to make them unique using a suffix automatically." 
)) end - # init ranges ranges = Vector{UnitRange{T}}(undef, nrow(dsl)) - # if oncols_left has only one column. if length(oncols_left) == 1 && nrow(dsr)>1 - # if the key in dsr is unique, success is true. - # for row in left, must be one to one. success, result = _join_left!_dict(dsl, dsr, ranges, oncols_left, oncols_right, right_cols, Val(T); makeunique = makeunique, mapformats = mapformats, check = check, threads = threads, multiple_match = multiple_match, multiple_match_name = multiple_match_name, obs_id = obs_id, obs_id_name = obs_id_name) if success return result end end - # idx is the order of dsr by hashed value. - # uniquemode is true, means that no repeat idx, uniquemode = _find_permute_and_fill_range_for_join!(ranges, dsr, dsl, oncols_right, oncols_left, stable, alg, mapformats, accelerate, threads = threads) - # This function will find ranges. if ranges has elements like 2:1, the first number is larger than second - # which means that this row in dsl has no matched row in dsr. for j in 1:length(oncols_left) _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], j, threads = threads) end