Skip to content

Instantly share code, notes, and snippets.

@carstenbauer
Created January 27, 2022 09:48
Show Gist options
  • Select an option

  • Save carstenbauer/9a2f7707350902cb969d82f60c0f1cba to your computer and use it in GitHub Desktop.

Select an option

Save carstenbauer/9a2f7707350902cb969d82f60c0f1cba to your computer and use it in GitHub Desktop.
Attempt to measure core-to-core latency in Julia
using ThreadPinning
using UnicodePlots
using Base.Threads: @threads, nthreads
# copied from ThreadPools.jl
#
#     @tspawnat thrdid expr
#
# Wrap `expr` in a zero-argument closure, create a `Task` from it, hand the task to the
# runtime call `jl_set_task_tid` together with `thrdid - 1`, schedule it, and evaluate
# to the `Task` object (so the caller can `fetch`/`wait` on it).
#
# The generated code throws an `AssertionError` at run time when `thrdid` is outside
# `1:Threads.nthreads()`.
#
# NOTE(review): `$tid` is spliced into the expansion in several places (range check,
# error message, ccall), so the `thrdid` expression is evaluated more than once — pass a
# side-effect-free expression.
macro tspawnat(thrdid, expr)
# Internal Base helper; presumably lifts `$`-interpolated values out of `expr` into
# `let` bindings the way `Threads.@spawn` does — TODO confirm against Base.
letargs = Base._lift_one_interp!(expr)
# The user expression becomes the body of a closure evaluated in the caller's scope.
thunk = esc(:(() -> ($expr)))
# Name of the variable looked up below to register the task (internal to Base;
# presumably the task collection maintained by `@sync` — TODO confirm).
var = esc(Base.sync_varname)
tid = esc(thrdid)
quote
# Run-time validation of the requested thread id.
if $tid < 1 || $tid > Threads.nthreads()
throw(AssertionError("@tspawnat thread assignment ($($tid)) must be between 1 and Threads.nthreads() (1:$(Threads.nthreads()))"))
end
let $(letargs...)
local task = Task($thunk)
# NOTE(review): the task is marked non-sticky before its thread id is set; whether
# the scheduler may later move it to another thread depends on the Julia version —
# confirm for the version in use.
task.sticky = false
# The runtime receives `thrdid - 1` (presumably a 0-based thread index).
ccall(:jl_set_task_tid, Cvoid, (Any, Cint), task, $tid - 1)
# Only when the variable named by `Base.sync_varname` is a local at the call site
# is the task pushed into it.
if $(Expr(:islocal, var))
put!($var, task)
end
schedule(task)
# Value of the macro call: the scheduled task.
task
end
end
end
# States of the ping-pong handshake stored in `Sync`. They are plain integers so that
# they fit into a `Threads.Atomic`.
const State = Int

const Preparing = State(0)  # initial value of a freshly constructed `Sync`
const Ready = State(1)      # written by the spawned task once it has pinned itself
const Ping = State(2)       # written by the timing thread to start a round trip
const Pong = State(3)       # written by the spawned task in reply to `Ping`
const Finish = State(4)     # written by the timing thread to end the spawned task's loop
"""
    Sync(; state = Threads.Atomic{State}(Preparing))

Shared synchronisation cell for the ping-pong benchmark: a single atomic integer holding
the current protocol state. A default-constructed `Sync` starts in `Preparing`.
"""
struct Sync
    state::Threads.Atomic{State}
end

# Keyword constructor with the same default the `Base.@kwdef` form provided.
Sync(; state = Threads.Atomic{State}(Preparing)) = Sync(state)
"""
    state(S::Sync)

Return the value currently stored in the atomic `state` field of `S`.
"""
function state(S::Sync)
    return S.state[]
end
"""
    wait_until(S::Sync, expected_state::State)

Busy-wait until the state of `S` equals `expected_state`, then return `nothing`.

The loop re-reads the state on every iteration and contains no yield or sleep.
"""
function wait_until(S::Sync, expected_state::State)
    while true
        state(S) == expected_state && break
    end
    return nothing
end
"""
    set(S::Sync, state::State)

Store `state` in the atomic cell of `S` and return `nothing`.
"""
function set(S::Sync, state::State)
    # The parameter `state` shadows the accessor function of the same name, so the
    # atomic field is addressed directly.
    setindex!(S.state, state)
    return nothing
end
"""
    wait_as_long_as(S::Sync, wait_state::State)

Busy-wait while the state of `S` equals `wait_state` and return the first state observed
that differs from it.
"""
function wait_as_long_as(S::Sync, wait_state::State)
    while true
        observed = state(S)
        observed == wait_state || return observed
    end
end
"""
    _run_latency_bench(cpu1::Integer, cpu2::Integer; nsamples::Integer = 100, mode::Symbol = :min)

Time ping-pong round trips between the calling thread, pinned to `cpu1`, and a task
spawned on Julia thread 2, pinned to `cpu2`. Times are taken with `time_ns()`, i.e. the
result is in nanoseconds and covers a full round trip (`Ping` there, `Pong` back).

# Keywords
- `nsamples`: number of round trips to time; must be at least 1.
- `mode`: `:avg` returns the mean over all samples, `:min` / `:minimum` the smallest one.

# Returns
A `Float64`. When `cpu1 == cpu2` nothing is measured and `0.0` is returned.

# Throws
- `ArgumentError` for an unknown `mode` or `nsamples < 1`.
- `ErrorException` when fewer than two Julia threads are available.

NOTE(review): the partner task is always placed on Julia thread 2, so this presumably
has to be called from a different thread (e.g. thread 1) — confirm against callers.
"""
function _run_latency_bench(
    cpu1::Integer,
    cpu2::Integer;
    nsamples::Integer = 100,
    mode::Symbol = :min,
)
    cpu1 == cpu2 && return zero(Float64)

    # Validate everything *before* the partner task exists: it only leaves its spin
    # loop after seeing `Finish`, so an error raised later would leave thread 2
    # spinning forever.
    mode in (:avg, :min, :minimum) || throw(ArgumentError("Unknown mode $mode."))
    nsamples >= 1 || throw(ArgumentError("nsamples must be at least 1, got $nsamples."))
    nthreads() >= 2 || error("Need at least two Julia threads.")

    S = Sync()
    # Allocate the sample buffer up front so that nothing is allocated on this thread
    # while the partner task is busy-spinning.
    Δts = zeros(typeof(time_ns()), nsamples)

    pinthread(cpu1)
    second_thread = @tspawnat 2 begin
        pinthread(cpu2)
        set(S, Ready)
        current = wait_as_long_as(S, Ready)
        while current != Finish
            if current == Ping
                set(S, Pong)
                current = wait_as_long_as(S, Pong)
            end
        end
    end

    try
        wait_until(S, Ready)
        for i in eachindex(Δts)
            t = time_ns()
            set(S, Ping)
            wait_until(S, Pong)
            Δts[i] = time_ns() - t
        end
    finally
        # Signal the partner to stop even if the measurement loop was interrupted.
        set(S, Finish)
    end
    fetch(second_thread)

    return mode == :avg ? sum(Float64, Δts) / nsamples : Float64(minimum(Δts))
end
"""
    bench_core2core_latency(cpuids = 0:Sys.CPU_THREADS-1; nbench = 5, kwargs...)

Benchmark every ordered pair of CPUs in `cpuids` with `_run_latency_bench` and return a
`length(cpuids) × length(cpuids)` matrix of latencies averaged over `nbench` repetitions.
Entry `[i, j]` belongs to the pair (`i`-th CPU ID, `j`-th CPU ID) in iteration order of
`cpuids`.

# Arguments
- `cpuids`: iterable of 0-based CPU IDs, each in `0:Sys.CPU_THREADS-1`.

# Keywords
- `nbench`: number of repetitions to average over; must be at least 1.
- `kwargs...`: forwarded to `_run_latency_bench`.

# Throws
- `ArgumentError` for a CPU ID outside `0:Sys.CPU_THREADS-1` or `nbench < 1`.

The thread affinity reported by `getcpuids()` before the run is re-applied with
`pinthreads` afterwards, also when a benchmark throws.
"""
function bench_core2core_latency(cpuids = 0:Sys.CPU_THREADS-1; nbench = 5, kwargs...)
    # Reject bad input before touching any thread affinity. IDs are 0-based, so
    # `Sys.CPU_THREADS` itself is already out of range.
    for c in cpuids
        if c < 0 || c >= Sys.CPU_THREADS
            throw(
                ArgumentError(
                    "CPU IDs must all lie in 0:Sys.CPU_THREADS-1 " *
                    "(0:$(Sys.CPU_THREADS - 1)), got $c.",
                ),
            )
        end
    end
    nbench >= 1 || throw(ArgumentError("nbench must be at least 1, got $nbench."))

    # backup current thread affinity
    pinning_before = getcpuids()

    ncpuids = length(cpuids)
    latencies = zeros(ncpuids, ncpuids)
    try
        for _ in 1:nbench
            # `enumerate` always counts from 1, matching the axes of `latencies`
            # regardless of how `cpuids` itself is indexed.
            for (j, cpu2) in enumerate(cpuids)
                for (i, cpu1) in enumerate(cpuids)
                    latencies[i, j] += _run_latency_bench(cpu1, cpu2; kwargs...)
                end
            end
        end
    finally
        # restore previous thread affinity, even if a benchmark failed
        pinthreads(pinning_before)
    end

    latencies ./= nbench
    return latencies
end
"""
    core2core_latency(args...; kwargs...)

Forward all arguments to `bench_core2core_latency` and pass the resulting latency matrix
to `heatmap` with both axis offsets set to `-1`. Returns whatever `heatmap` returns.
"""
function core2core_latency(args...; kwargs...)
    return heatmap(
        bench_core2core_latency(args...; kwargs...);
        xoffset = -1,
        yoffset = -1,
    )
end
# This file is machine-generated - editing it directly is not advised
julia_version = "1.7.1"
manifest_format = "2.0"
[[deps.ArgTools]]
uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
[[deps.Artifacts]]
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
[[deps.Base64]]
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
[[deps.ChainRulesCore]]
deps = ["Compat", "LinearAlgebra", "SparseArrays"]
git-tree-sha1 = "54fc4400de6e5c3e27be6047da2ef6ba355511f8"
uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
version = "1.11.6"
[[deps.ChangesOfVariables]]
deps = ["ChainRulesCore", "LinearAlgebra", "Test"]
git-tree-sha1 = "bf98fa45a0a4cee295de98d4c1462be26345b9a1"
uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
version = "0.1.2"
[[deps.Compat]]
deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
git-tree-sha1 = "44c37b4636bc54afac5c574d2d02b625349d6582"
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
version = "3.41.0"
[[deps.CompilerSupportLibraries_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
[[deps.Contour]]
deps = ["StaticArrays"]
git-tree-sha1 = "9f02045d934dc030edad45944ea80dbd1f0ebea7"
uuid = "d38c429a-6771-53c6-b99e-75d170b6e991"
version = "0.5.7"
[[deps.Crayons]]
git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15"
uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f"
version = "4.1.1"
[[deps.DataAPI]]
git-tree-sha1 = "cc70b17275652eb47bc9e5f81635981f13cea5c8"
uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
version = "1.9.0"
[[deps.DataStructures]]
deps = ["Compat", "InteractiveUtils", "OrderedCollections"]
git-tree-sha1 = "3daef5523dd2e769dad2365274f760ff5f282c7d"
uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
version = "0.18.11"
[[deps.Dates]]
deps = ["Printf"]
uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
[[deps.DelimitedFiles]]
deps = ["Mmap"]
uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"
[[deps.Distributed]]
deps = ["Random", "Serialization", "Sockets"]
uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
[[deps.DocStringExtensions]]
deps = ["LibGit2"]
git-tree-sha1 = "b19534d1895d702889b219c382a6e18010797f0b"
uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
version = "0.8.6"
[[deps.Downloads]]
deps = ["ArgTools", "LibCURL", "NetworkOptions"]
uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
[[deps.InteractiveUtils]]
deps = ["Markdown"]
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
[[deps.InverseFunctions]]
deps = ["Test"]
git-tree-sha1 = "a7254c0acd8e62f1ac75ad24d5db43f5f19f3c65"
uuid = "3587e190-3f89-42d0-90ee-14403ec27112"
version = "0.1.2"
[[deps.IrrationalConstants]]
git-tree-sha1 = "7fd44fd4ff43fc60815f8e764c0f352b83c49151"
uuid = "92d709cd-6900-40b7-9082-c6be49f344b6"
version = "0.1.1"
[[deps.LibCURL]]
deps = ["LibCURL_jll", "MozillaCACerts_jll"]
uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
[[deps.LibCURL_jll]]
deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
[[deps.LibGit2]]
deps = ["Base64", "NetworkOptions", "Printf", "SHA"]
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
[[deps.LibSSH2_jll]]
deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
[[deps.Libdl]]
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
[[deps.LinearAlgebra]]
deps = ["Libdl", "libblastrampoline_jll"]
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
[[deps.LogExpFunctions]]
deps = ["ChainRulesCore", "ChangesOfVariables", "DocStringExtensions", "InverseFunctions", "IrrationalConstants", "LinearAlgebra"]
git-tree-sha1 = "e5718a00af0ab9756305a0392832c8952c7426c1"
uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
version = "0.3.6"
[[deps.Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
[[deps.Markdown]]
deps = ["Base64"]
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
[[deps.MbedTLS_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
[[deps.Missings]]
deps = ["DataAPI"]
git-tree-sha1 = "bf210ce90b6c9eed32d25dbcae1ebc565df2687f"
uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
version = "1.0.2"
[[deps.Mmap]]
uuid = "a63ad114-7e13-5084-954f-fe012c677804"
[[deps.MozillaCACerts_jll]]
uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
[[deps.NetworkOptions]]
uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
[[deps.OpenBLAS_jll]]
deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"]
uuid = "4536629a-c528-5b80-bd46-f80d51c5b363"
[[deps.OrderedCollections]]
git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.4.1"
[[deps.Pkg]]
deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
[[deps.Printf]]
deps = ["Unicode"]
uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
[[deps.REPL]]
deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"]
uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
[[deps.Random]]
deps = ["SHA", "Serialization"]
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
[[deps.Requires]]
deps = ["UUIDs"]
git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7"
uuid = "ae029012-a4dd-5104-9daa-d747884805df"
version = "1.3.0"
[[deps.SHA]]
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
[[deps.Serialization]]
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
[[deps.SharedArrays]]
deps = ["Distributed", "Mmap", "Random", "Serialization"]
uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
[[deps.Sockets]]
uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
[[deps.SortingAlgorithms]]
deps = ["DataStructures"]
git-tree-sha1 = "b3363d7460f7d098ca0912c69b082f75625d7508"
uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c"
version = "1.0.1"
[[deps.SparseArrays]]
deps = ["LinearAlgebra", "Random"]
uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
[[deps.StaticArrays]]
deps = ["LinearAlgebra", "Random", "Statistics"]
git-tree-sha1 = "2884859916598f974858ff01df7dfc6c708dd895"
uuid = "90137ffa-7385-5640-81b9-e52037218182"
version = "1.3.3"
[[deps.Statistics]]
deps = ["LinearAlgebra", "SparseArrays"]
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
[[deps.StatsAPI]]
git-tree-sha1 = "d88665adc9bcf45903013af0982e2fd05ae3d0a6"
uuid = "82ae8749-77ed-4fe6-ae5f-f523153014b0"
version = "1.2.0"
[[deps.StatsBase]]
deps = ["DataAPI", "DataStructures", "LinearAlgebra", "LogExpFunctions", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics", "StatsAPI"]
git-tree-sha1 = "51383f2d367eb3b444c961d485c565e4c0cf4ba0"
uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
version = "0.33.14"
[[deps.TOML]]
deps = ["Dates"]
uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
[[deps.Tar]]
deps = ["ArgTools", "SHA"]
uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
[[deps.Test]]
deps = ["InteractiveUtils", "Logging", "Random", "Serialization"]
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[[deps.ThreadPinning]]
deps = ["Libdl", "LinearAlgebra", "Random", "Requires"]
git-tree-sha1 = "f999f26862999bf23935097c2570fd4192db80d6"
uuid = "811555cd-349b-4f26-b7bc-1f208b848042"
version = "0.3.0"
[[deps.UUIDs]]
deps = ["Random", "SHA"]
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
[[deps.Unicode]]
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
[[deps.UnicodePlots]]
deps = ["Contour", "Crayons", "Dates", "SparseArrays", "StatsBase"]
git-tree-sha1 = "62595983da672758a96f89e07f7fd3735f16c18c"
uuid = "b8865327-cd53-5732-bb35-84acbb429228"
version = "2.7.0"
[[deps.Zlib_jll]]
deps = ["Libdl"]
uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
[[deps.libblastrampoline_jll]]
deps = ["Artifacts", "Libdl", "OpenBLAS_jll"]
uuid = "8e850b90-86db-534c-a0d3-1478176c7d93"
[[deps.nghttp2_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
[[deps.p7zip_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"
[deps]
ThreadPinning = "811555cd-349b-4f26-b7bc-1f208b848042"
UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228"
@carstenbauer
Copy link
Author

carstenbauer commented Jan 27, 2022

Running c2clat on the same system, I find the following. Note that I multiplied the resulting latencies by two, since the Julia variant measures a full round trip between cores instead of just one way.

Screenshot 2022-01-27 at 11 12 24

julia> M
40×40 Matrix{Float64}:
   0.0  202.0  200.0  192.0  206.0  202.0  202.0  196.0  190.0  192.0    486.0  494.0  484.0  486.0  492.0  508.0  488.0  492.0  494.0  498.0
 202.0    0.0  200.0  188.0  204.0  200.0  194.0  190.0  188.0  190.0     490.0  492.0  484.0  482.0  498.0  500.0  488.0  490.0  492.0  492.0
 200.0  200.0    0.0  204.0  198.0  202.0  202.0  204.0  196.0  214.0     496.0  498.0  490.0  488.0  502.0  512.0  496.0  494.0  496.0  502.0
 192.0  188.0  204.0    0.0  216.0  192.0  186.0  204.0  198.0  218.0     490.0  490.0  480.0  484.0  496.0  490.0  484.0  486.0  496.0  492.0
 206.0  204.0  198.0  216.0    0.0  212.0  206.0  196.0  220.0  204.0     502.0  504.0  496.0  494.0  508.0  510.0  502.0  502.0  504.0  506.0
 202.0  200.0  202.0  192.0  212.0    0.0  200.0  198.0  204.0  190.0    498.0  496.0  488.0  492.0  500.0  506.0  496.0  498.0  496.0  500.0
 202.0  194.0  202.0  186.0  206.0  200.0    0.0  188.0  192.0  196.0     506.0  506.0  496.0  496.0  504.0  518.0  506.0  502.0  502.0  502.0
 196.0  190.0  204.0  204.0  196.0  198.0  188.0    0.0  214.0  200.0     494.0  490.0  486.0  482.0  498.0  496.0  492.0  492.0  496.0  496.0
 190.0  188.0  196.0  198.0  220.0  204.0  192.0  214.0    0.0  222.0     496.0  498.0  486.0  492.0  504.0  500.0  496.0  494.0  502.0  500.0
 192.0  190.0  214.0  218.0  204.0  190.0  196.0  200.0  222.0    0.0     500.0  498.0  492.0  492.0  506.0  500.0  496.0  496.0  498.0  502.0
                                                                                                                                    
 486.0  490.0  496.0  490.0  502.0  498.0  506.0  494.0  496.0  500.0      0.0  190.0  186.0  186.0  194.0  198.0  190.0  186.0  192.0  212.0
 494.0  492.0  498.0  490.0  504.0  496.0  506.0  490.0  498.0  498.0     190.0    0.0  200.0  186.0  210.0  198.0  188.0  186.0  192.0  196.0
 484.0  484.0  490.0  480.0  496.0  488.0  496.0  486.0  486.0  492.0     186.0  200.0    0.0  204.0  198.0  200.0  188.0  186.0  196.0  214.0
 486.0  482.0  488.0  484.0  494.0  492.0  496.0  482.0  492.0  492.0     186.0  186.0  204.0    0.0  214.0  186.0  186.0  204.0  196.0  200.0
 492.0  498.0  502.0  496.0  508.0  500.0  504.0  498.0  504.0  506.0     194.0  210.0  198.0  214.0    0.0  208.0  196.0  212.0  220.0  224.0
 508.0  500.0  512.0  490.0  510.0  506.0  518.0  496.0  500.0  500.0    198.0  198.0  200.0  186.0  208.0    0.0  198.0  186.0  190.0  208.0
 488.0  488.0  496.0  484.0  502.0  496.0  506.0  492.0  496.0  496.0     190.0  188.0  188.0  186.0  196.0  198.0    0.0  202.0  210.0  198.0
 492.0  490.0  494.0  486.0  502.0  498.0  502.0  492.0  494.0  496.0     186.0  186.0  186.0  204.0  212.0  186.0  202.0    0.0  210.0  200.0
 494.0  492.0  496.0  496.0  504.0  496.0  502.0  496.0  502.0  498.0     192.0  192.0  196.0  196.0  220.0  190.0  210.0  210.0    0.0  222.0
 498.0  492.0  502.0  492.0  506.0  500.0  502.0  496.0  500.0  502.0     212.0  196.0  214.0  200.0  224.0  208.0  198.0  200.0  222.0    0.0

Here, the difference between intra- and inter-socket latency is more pronounced. Intra-socket latency appears to be only slightly lower, but inter-socket latency turns out to be much higher than in the Julia findings. Still, I take it the results are in the same "ballpark".

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment