Routine: get rid of ::SetUp()

Since we now use C++ exceptions inside the implementation (and exceptions
can be thrown from constructors), there is no need for a separate
Routine::SetUp() function.

To make this possible, we also change how the kernel source string is constructed.
The kernel-specific source code is now passed to the Routine ctor via
an initializer_list of C strings to avoid unnecessary data copying
while also working around MSVC 2013 error C1091 (string literal too long).
pull/117/head
Ivan Shapovalov 2016-10-18 04:53:06 +03:00
parent b98af44fcf
commit 56f300607b
22 changed files with 104 additions and 174 deletions

View File

@ -52,7 +52,6 @@ def clblast_cc(routine):
result += " try {" + NL
result += " auto queue_cpp = Queue(*queue);" + NL
result += " auto routine = X" + routine.name + "<" + routine.template.template + ">(queue_cpp, event);" + NL
result += " routine.SetUp();" + NL
result += " routine.Do" + routine.name.capitalize() + "("
result += ("," + NL + indent1).join([a for a in routine.arguments_clcudaapi()])
result += ");" + NL

View File

@ -171,7 +171,6 @@ StatusCode Swap(const size_t n,
try {
auto queue_cpp = Queue(*queue);
auto routine = Xswap<T>(queue_cpp, event);
routine.SetUp();
routine.DoSwap(n,
Buffer<T>(x_buffer), x_offset, x_inc,
Buffer<T>(y_buffer), y_offset, y_inc);
@ -208,7 +207,6 @@ StatusCode Scal(const size_t n,
try {
auto queue_cpp = Queue(*queue);
auto routine = Xscal<T>(queue_cpp, event);
routine.SetUp();
routine.DoScal(n,
alpha,
Buffer<T>(x_buffer), x_offset, x_inc);
@ -245,7 +243,6 @@ StatusCode Copy(const size_t n,
try {
auto queue_cpp = Queue(*queue);
auto routine = Xcopy<T>(queue_cpp, event);
routine.SetUp();
routine.DoCopy(n,
Buffer<T>(x_buffer), x_offset, x_inc,
Buffer<T>(y_buffer), y_offset, y_inc);
@ -283,7 +280,6 @@ StatusCode Axpy(const size_t n,
try {
auto queue_cpp = Queue(*queue);
auto routine = Xaxpy<T>(queue_cpp, event);
routine.SetUp();
routine.DoAxpy(n,
alpha,
Buffer<T>(x_buffer), x_offset, x_inc,
@ -327,7 +323,6 @@ StatusCode Dot(const size_t n,
try {
auto queue_cpp = Queue(*queue);
auto routine = Xdot<T>(queue_cpp, event);
routine.SetUp();
routine.DoDot(n,
Buffer<T>(dot_buffer), dot_offset,
Buffer<T>(x_buffer), x_offset, x_inc,
@ -361,7 +356,6 @@ StatusCode Dotu(const size_t n,
try {
auto queue_cpp = Queue(*queue);
auto routine = Xdotu<T>(queue_cpp, event);
routine.SetUp();
routine.DoDotu(n,
Buffer<T>(dot_buffer), dot_offset,
Buffer<T>(x_buffer), x_offset, x_inc,
@ -390,7 +384,6 @@ StatusCode Dotc(const size_t n,
try {
auto queue_cpp = Queue(*queue);
auto routine = Xdotc<T>(queue_cpp, event);
routine.SetUp();
routine.DoDotc(n,
Buffer<T>(dot_buffer), dot_offset,
Buffer<T>(x_buffer), x_offset, x_inc,
@ -418,7 +411,6 @@ StatusCode Nrm2(const size_t n,
try {
auto queue_cpp = Queue(*queue);
auto routine = Xnrm2<T>(queue_cpp, event);
routine.SetUp();
routine.DoNrm2(n,
Buffer<T>(nrm2_buffer), nrm2_offset,
Buffer<T>(x_buffer), x_offset, x_inc);
@ -455,7 +447,6 @@ StatusCode Asum(const size_t n,
try {
auto queue_cpp = Queue(*queue);
auto routine = Xasum<T>(queue_cpp, event);
routine.SetUp();
routine.DoAsum(n,
Buffer<T>(asum_buffer), asum_offset,
Buffer<T>(x_buffer), x_offset, x_inc);
@ -492,7 +483,6 @@ StatusCode Sum(const size_t n,
try {
auto queue_cpp = Queue(*queue);
auto routine = Xsum<T>(queue_cpp, event);
routine.SetUp();
routine.DoSum(n,
Buffer<T>(sum_buffer), sum_offset,
Buffer<T>(x_buffer), x_offset, x_inc);
@ -529,7 +519,6 @@ StatusCode Amax(const size_t n,
try {
auto queue_cpp = Queue(*queue);
auto routine = Xamax<T>(queue_cpp, event);
routine.SetUp();
routine.DoAmax(n,
Buffer<unsigned int>(imax_buffer), imax_offset,
Buffer<T>(x_buffer), x_offset, x_inc);
@ -566,7 +555,6 @@ StatusCode Max(const size_t n,
try {
auto queue_cpp = Queue(*queue);
auto routine = Xmax<T>(queue_cpp, event);
routine.SetUp();
routine.DoMax(n,
Buffer<unsigned int>(imax_buffer), imax_offset,
Buffer<T>(x_buffer), x_offset, x_inc);
@ -603,7 +591,6 @@ StatusCode Min(const size_t n,
try {
auto queue_cpp = Queue(*queue);
auto routine = Xmin<T>(queue_cpp, event);
routine.SetUp();
routine.DoMin(n,
Buffer<unsigned int>(imin_buffer), imin_offset,
Buffer<T>(x_buffer), x_offset, x_inc);
@ -648,7 +635,6 @@ StatusCode Gemv(const Layout layout, const Transpose a_transpose,
try {
auto queue_cpp = Queue(*queue);
auto routine = Xgemv<T>(queue_cpp, event);
routine.SetUp();
routine.DoGemv(layout, a_transpose,
m, n,
alpha,
@ -713,7 +699,6 @@ StatusCode Gbmv(const Layout layout, const Transpose a_transpose,
try {
auto queue_cpp = Queue(*queue);
auto routine = Xgbmv<T>(queue_cpp, event);
routine.SetUp();
routine.DoGbmv(layout, a_transpose,
m, n, kl, ku,
alpha,
@ -778,7 +763,6 @@ StatusCode Hemv(const Layout layout, const Triangle triangle,
try {
auto queue_cpp = Queue(*queue);
auto routine = Xhemv<T>(queue_cpp, event);
routine.SetUp();
routine.DoHemv(layout, triangle,
n,
alpha,
@ -819,7 +803,6 @@ StatusCode Hbmv(const Layout layout, const Triangle triangle,
try {
auto queue_cpp = Queue(*queue);
auto routine = Xhbmv<T>(queue_cpp, event);
routine.SetUp();
routine.DoHbmv(layout, triangle,
n, k,
alpha,
@ -860,7 +843,6 @@ StatusCode Hpmv(const Layout layout, const Triangle triangle,
try {
auto queue_cpp = Queue(*queue);
auto routine = Xhpmv<T>(queue_cpp, event);
routine.SetUp();
routine.DoHpmv(layout, triangle,
n,
alpha,
@ -901,7 +883,6 @@ StatusCode Symv(const Layout layout, const Triangle triangle,
try {
auto queue_cpp = Queue(*queue);
auto routine = Xsymv<T>(queue_cpp, event);
routine.SetUp();
routine.DoSymv(layout, triangle,
n,
alpha,
@ -950,7 +931,6 @@ StatusCode Sbmv(const Layout layout, const Triangle triangle,
try {
auto queue_cpp = Queue(*queue);
auto routine = Xsbmv<T>(queue_cpp, event);
routine.SetUp();
routine.DoSbmv(layout, triangle,
n, k,
alpha,
@ -999,7 +979,6 @@ StatusCode Spmv(const Layout layout, const Triangle triangle,
try {
auto queue_cpp = Queue(*queue);
auto routine = Xspmv<T>(queue_cpp, event);
routine.SetUp();
routine.DoSpmv(layout, triangle,
n,
alpha,
@ -1045,7 +1024,6 @@ StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_
try {
auto queue_cpp = Queue(*queue);
auto routine = Xtrmv<T>(queue_cpp, event);
routine.SetUp();
routine.DoTrmv(layout, triangle, a_transpose, diagonal,
n,
Buffer<T>(a_buffer), a_offset, a_ld,
@ -1089,7 +1067,6 @@ StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_
try {
auto queue_cpp = Queue(*queue);
auto routine = Xtbmv<T>(queue_cpp, event);
routine.SetUp();
routine.DoTbmv(layout, triangle, a_transpose, diagonal,
n, k,
Buffer<T>(a_buffer), a_offset, a_ld,
@ -1133,7 +1110,6 @@ StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_
try {
auto queue_cpp = Queue(*queue);
auto routine = Xtpmv<T>(queue_cpp, event);
routine.SetUp();
routine.DoTpmv(layout, triangle, a_transpose, diagonal,
n,
Buffer<T>(ap_buffer), ap_offset,
@ -1269,7 +1245,6 @@ StatusCode Ger(const Layout layout,
try {
auto queue_cpp = Queue(*queue);
auto routine = Xger<T>(queue_cpp, event);
routine.SetUp();
routine.DoGer(layout,
m, n,
alpha,
@ -1313,7 +1288,6 @@ StatusCode Geru(const Layout layout,
try {
auto queue_cpp = Queue(*queue);
auto routine = Xgeru<T>(queue_cpp, event);
routine.SetUp();
routine.DoGeru(layout,
m, n,
alpha,
@ -1350,7 +1324,6 @@ StatusCode Gerc(const Layout layout,
try {
auto queue_cpp = Queue(*queue);
auto routine = Xgerc<T>(queue_cpp, event);
routine.SetUp();
routine.DoGerc(layout,
m, n,
alpha,
@ -1386,7 +1359,6 @@ StatusCode Her(const Layout layout, const Triangle triangle,
try {
auto queue_cpp = Queue(*queue);
auto routine = Xher<std::complex<T>,T>(queue_cpp, event);
routine.SetUp();
routine.DoHer(layout, triangle,
n,
alpha,
@ -1419,7 +1391,6 @@ StatusCode Hpr(const Layout layout, const Triangle triangle,
try {
auto queue_cpp = Queue(*queue);
auto routine = Xhpr<std::complex<T>,T>(queue_cpp, event);
routine.SetUp();
routine.DoHpr(layout, triangle,
n,
alpha,
@ -1453,7 +1424,6 @@ StatusCode Her2(const Layout layout, const Triangle triangle,
try {
auto queue_cpp = Queue(*queue);
auto routine = Xher2<T>(queue_cpp, event);
routine.SetUp();
routine.DoHer2(layout, triangle,
n,
alpha,
@ -1490,7 +1460,6 @@ StatusCode Hpr2(const Layout layout, const Triangle triangle,
try {
auto queue_cpp = Queue(*queue);
auto routine = Xhpr2<T>(queue_cpp, event);
routine.SetUp();
routine.DoHpr2(layout, triangle,
n,
alpha,
@ -1526,7 +1495,6 @@ StatusCode Syr(const Layout layout, const Triangle triangle,
try {
auto queue_cpp = Queue(*queue);
auto routine = Xsyr<T>(queue_cpp, event);
routine.SetUp();
routine.DoSyr(layout, triangle,
n,
alpha,
@ -1565,7 +1533,6 @@ StatusCode Spr(const Layout layout, const Triangle triangle,
try {
auto queue_cpp = Queue(*queue);
auto routine = Xspr<T>(queue_cpp, event);
routine.SetUp();
routine.DoSpr(layout, triangle,
n,
alpha,
@ -1605,7 +1572,6 @@ StatusCode Syr2(const Layout layout, const Triangle triangle,
try {
auto queue_cpp = Queue(*queue);
auto routine = Xsyr2<T>(queue_cpp, event);
routine.SetUp();
routine.DoSyr2(layout, triangle,
n,
alpha,
@ -1649,7 +1615,6 @@ StatusCode Spr2(const Layout layout, const Triangle triangle,
try {
auto queue_cpp = Queue(*queue);
auto routine = Xspr2<T>(queue_cpp, event);
routine.SetUp();
routine.DoSpr2(layout, triangle,
n,
alpha,
@ -1698,7 +1663,6 @@ StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpos
try {
auto queue_cpp = Queue(*queue);
auto routine = Xgemm<T>(queue_cpp, event);
routine.SetUp();
routine.DoGemm(layout, a_transpose, b_transpose,
m, n, k,
alpha,
@ -1763,7 +1727,6 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
try {
auto queue_cpp = Queue(*queue);
auto routine = Xsymm<T>(queue_cpp, event);
routine.SetUp();
routine.DoSymm(layout, side, triangle,
m, n,
alpha,
@ -1828,7 +1791,6 @@ StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle,
try {
auto queue_cpp = Queue(*queue);
auto routine = Xhemm<T>(queue_cpp, event);
routine.SetUp();
routine.DoHemm(layout, side, triangle,
m, n,
alpha,
@ -1868,7 +1830,6 @@ StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_
try {
auto queue_cpp = Queue(*queue);
auto routine = Xsyrk<T>(queue_cpp, event);
routine.SetUp();
routine.DoSyrk(layout, triangle, a_transpose,
n, k,
alpha,
@ -1926,7 +1887,6 @@ StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_
try {
auto queue_cpp = Queue(*queue);
auto routine = Xherk<std::complex<T>,T>(queue_cpp, event);
routine.SetUp();
routine.DoHerk(layout, triangle, a_transpose,
n, k,
alpha,
@ -1964,7 +1924,6 @@ StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose a
try {
auto queue_cpp = Queue(*queue);
auto routine = Xsyr2k<T>(queue_cpp, event);
routine.SetUp();
routine.DoSyr2k(layout, triangle, ab_transpose,
n, k,
alpha,
@ -2029,7 +1988,6 @@ StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose a
try {
auto queue_cpp = Queue(*queue);
auto routine = Xher2k<T,U>(queue_cpp, event);
routine.SetUp();
routine.DoHer2k(layout, triangle, ab_transpose,
n, k,
alpha,
@ -2068,7 +2026,6 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, c
try {
auto queue_cpp = Queue(*queue);
auto routine = Xtrmm<T>(queue_cpp, event);
routine.SetUp();
routine.DoTrmm(layout, side, triangle, a_transpose, diagonal,
m, n,
alpha,
@ -2164,7 +2121,6 @@ StatusCode Omatcopy(const Layout layout, const Transpose a_transpose,
try {
auto queue_cpp = Queue(*queue);
auto routine = Xomatcopy<T>(queue_cpp, event);
routine.SetUp();
routine.DoOmatcopy(layout, a_transpose,
m, n,
alpha,
@ -2225,57 +2181,57 @@ StatusCode FillCache(const cl_device_id device) {
auto queue = Queue(context, device_cpp);
// Runs all the level 1 set-up functions
Xswap<float>(queue, nullptr).SetUp(); Xswap<double>(queue, nullptr).SetUp(); Xswap<float2>(queue, nullptr).SetUp(); Xswap<double2>(queue, nullptr).SetUp();
Xswap<float>(queue, nullptr).SetUp(); Xswap<double>(queue, nullptr).SetUp(); Xswap<float2>(queue, nullptr).SetUp(); Xswap<double2>(queue, nullptr).SetUp();
Xscal<float>(queue, nullptr).SetUp(); Xscal<double>(queue, nullptr).SetUp(); Xscal<float2>(queue, nullptr).SetUp(); Xscal<double2>(queue, nullptr).SetUp();
Xcopy<float>(queue, nullptr).SetUp(); Xcopy<double>(queue, nullptr).SetUp(); Xcopy<float2>(queue, nullptr).SetUp(); Xcopy<double2>(queue, nullptr).SetUp();
Xaxpy<float>(queue, nullptr).SetUp(); Xaxpy<double>(queue, nullptr).SetUp(); Xaxpy<float2>(queue, nullptr).SetUp(); Xaxpy<double2>(queue, nullptr).SetUp();
Xdot<float>(queue, nullptr).SetUp(); Xdot<double>(queue, nullptr).SetUp();
Xdotu<float2>(queue, nullptr).SetUp(); Xdotu<double2>(queue, nullptr).SetUp();
Xdotc<float2>(queue, nullptr).SetUp(); Xdotc<double2>(queue, nullptr).SetUp();
Xnrm2<float>(queue, nullptr).SetUp(); Xnrm2<double>(queue, nullptr).SetUp(); Xnrm2<float2>(queue, nullptr).SetUp(); Xnrm2<double2>(queue, nullptr).SetUp();
Xasum<float>(queue, nullptr).SetUp(); Xasum<double>(queue, nullptr).SetUp(); Xasum<float2>(queue, nullptr).SetUp(); Xasum<double2>(queue, nullptr).SetUp();
Xsum<float>(queue, nullptr).SetUp(); Xsum<double>(queue, nullptr).SetUp(); Xsum<float2>(queue, nullptr).SetUp(); Xsum<double2>(queue, nullptr).SetUp();
Xamax<float>(queue, nullptr).SetUp(); Xamax<double>(queue, nullptr).SetUp(); Xamax<float2>(queue, nullptr).SetUp(); Xamax<double2>(queue, nullptr).SetUp();
Xmax<float>(queue, nullptr).SetUp(); Xmax<double>(queue, nullptr).SetUp(); Xmax<float2>(queue, nullptr).SetUp(); Xmax<double2>(queue, nullptr).SetUp();
Xmin<float>(queue, nullptr).SetUp(); Xmin<double>(queue, nullptr).SetUp(); Xmin<float2>(queue, nullptr).SetUp(); Xmin<double2>(queue, nullptr).SetUp();
Xswap<float>(queue, nullptr); Xswap<double>(queue, nullptr); Xswap<float2>(queue, nullptr); Xswap<double2>(queue, nullptr);
Xswap<float>(queue, nullptr); Xswap<double>(queue, nullptr); Xswap<float2>(queue, nullptr); Xswap<double2>(queue, nullptr);
Xscal<float>(queue, nullptr); Xscal<double>(queue, nullptr); Xscal<float2>(queue, nullptr); Xscal<double2>(queue, nullptr);
Xcopy<float>(queue, nullptr); Xcopy<double>(queue, nullptr); Xcopy<float2>(queue, nullptr); Xcopy<double2>(queue, nullptr);
Xaxpy<float>(queue, nullptr); Xaxpy<double>(queue, nullptr); Xaxpy<float2>(queue, nullptr); Xaxpy<double2>(queue, nullptr);
Xdot<float>(queue, nullptr); Xdot<double>(queue, nullptr);
Xdotu<float2>(queue, nullptr); Xdotu<double2>(queue, nullptr);
Xdotc<float2>(queue, nullptr); Xdotc<double2>(queue, nullptr);
Xnrm2<float>(queue, nullptr); Xnrm2<double>(queue, nullptr); Xnrm2<float2>(queue, nullptr); Xnrm2<double2>(queue, nullptr);
Xasum<float>(queue, nullptr); Xasum<double>(queue, nullptr); Xasum<float2>(queue, nullptr); Xasum<double2>(queue, nullptr);
Xsum<float>(queue, nullptr); Xsum<double>(queue, nullptr); Xsum<float2>(queue, nullptr); Xsum<double2>(queue, nullptr);
Xamax<float>(queue, nullptr); Xamax<double>(queue, nullptr); Xamax<float2>(queue, nullptr); Xamax<double2>(queue, nullptr);
Xmax<float>(queue, nullptr); Xmax<double>(queue, nullptr); Xmax<float2>(queue, nullptr); Xmax<double2>(queue, nullptr);
Xmin<float>(queue, nullptr); Xmin<double>(queue, nullptr); Xmin<float2>(queue, nullptr); Xmin<double2>(queue, nullptr);
// Runs all the level 2 set-up functions
Xgemv<float>(queue, nullptr).SetUp(); Xgemv<double>(queue, nullptr).SetUp(); Xgemv<float2>(queue, nullptr).SetUp(); Xgemv<double2>(queue, nullptr).SetUp();
Xgbmv<float>(queue, nullptr).SetUp(); Xgbmv<double>(queue, nullptr).SetUp(); Xgbmv<float2>(queue, nullptr).SetUp(); Xgbmv<double2>(queue, nullptr).SetUp();
Xhemv<float2>(queue, nullptr).SetUp(); Xhemv<double2>(queue, nullptr).SetUp();
Xhbmv<float2>(queue, nullptr).SetUp(); Xhbmv<double2>(queue, nullptr).SetUp();
Xhpmv<float2>(queue, nullptr).SetUp(); Xhpmv<double2>(queue, nullptr).SetUp();
Xsymv<float>(queue, nullptr).SetUp(); Xsymv<double>(queue, nullptr).SetUp();
Xsbmv<float>(queue, nullptr).SetUp(); Xsbmv<double>(queue, nullptr).SetUp();
Xspmv<float>(queue, nullptr).SetUp(); Xspmv<double>(queue, nullptr).SetUp();
Xtrmv<float>(queue, nullptr).SetUp(); Xtrmv<double>(queue, nullptr).SetUp(); Xtrmv<float2>(queue, nullptr).SetUp(); Xtrmv<double2>(queue, nullptr).SetUp();
Xtbmv<float>(queue, nullptr).SetUp(); Xtbmv<double>(queue, nullptr).SetUp(); Xtbmv<float2>(queue, nullptr).SetUp(); Xtbmv<double2>(queue, nullptr).SetUp();
Xtpmv<float>(queue, nullptr).SetUp(); Xtpmv<double>(queue, nullptr).SetUp(); Xtpmv<float2>(queue, nullptr).SetUp(); Xtpmv<double2>(queue, nullptr).SetUp();
Xger<float>(queue, nullptr).SetUp(); Xger<double>(queue, nullptr).SetUp();
Xgeru<float2>(queue, nullptr).SetUp(); Xgeru<double2>(queue, nullptr).SetUp();
Xgerc<float2>(queue, nullptr).SetUp(); Xgerc<double2>(queue, nullptr).SetUp();
Xher<float2,float>(queue, nullptr).SetUp(); Xher<double2,double>(queue, nullptr).SetUp();
Xhpr<float2,float>(queue, nullptr).SetUp(); Xhpr<double2,double>(queue, nullptr).SetUp();
Xher2<float2>(queue, nullptr).SetUp(); Xher2<double2>(queue, nullptr).SetUp();
Xhpr2<float2>(queue, nullptr).SetUp(); Xhpr2<double2>(queue, nullptr).SetUp();
Xsyr<float>(queue, nullptr).SetUp(); Xsyr<double>(queue, nullptr).SetUp();
Xspr<float>(queue, nullptr).SetUp(); Xspr<double>(queue, nullptr).SetUp();
Xsyr2<float>(queue, nullptr).SetUp(); Xsyr2<double>(queue, nullptr).SetUp();
Xspr2<float>(queue, nullptr).SetUp(); Xspr2<double>(queue, nullptr).SetUp();
Xgemv<float>(queue, nullptr); Xgemv<double>(queue, nullptr); Xgemv<float2>(queue, nullptr); Xgemv<double2>(queue, nullptr);
Xgbmv<float>(queue, nullptr); Xgbmv<double>(queue, nullptr); Xgbmv<float2>(queue, nullptr); Xgbmv<double2>(queue, nullptr);
Xhemv<float2>(queue, nullptr); Xhemv<double2>(queue, nullptr);
Xhbmv<float2>(queue, nullptr); Xhbmv<double2>(queue, nullptr);
Xhpmv<float2>(queue, nullptr); Xhpmv<double2>(queue, nullptr);
Xsymv<float>(queue, nullptr); Xsymv<double>(queue, nullptr);
Xsbmv<float>(queue, nullptr); Xsbmv<double>(queue, nullptr);
Xspmv<float>(queue, nullptr); Xspmv<double>(queue, nullptr);
Xtrmv<float>(queue, nullptr); Xtrmv<double>(queue, nullptr); Xtrmv<float2>(queue, nullptr); Xtrmv<double2>(queue, nullptr);
Xtbmv<float>(queue, nullptr); Xtbmv<double>(queue, nullptr); Xtbmv<float2>(queue, nullptr); Xtbmv<double2>(queue, nullptr);
Xtpmv<float>(queue, nullptr); Xtpmv<double>(queue, nullptr); Xtpmv<float2>(queue, nullptr); Xtpmv<double2>(queue, nullptr);
Xger<float>(queue, nullptr); Xger<double>(queue, nullptr);
Xgeru<float2>(queue, nullptr); Xgeru<double2>(queue, nullptr);
Xgerc<float2>(queue, nullptr); Xgerc<double2>(queue, nullptr);
Xher<float2,float>(queue, nullptr); Xher<double2,double>(queue, nullptr);
Xhpr<float2,float>(queue, nullptr); Xhpr<double2,double>(queue, nullptr);
Xher2<float2>(queue, nullptr); Xher2<double2>(queue, nullptr);
Xhpr2<float2>(queue, nullptr); Xhpr2<double2>(queue, nullptr);
Xsyr<float>(queue, nullptr); Xsyr<double>(queue, nullptr);
Xspr<float>(queue, nullptr); Xspr<double>(queue, nullptr);
Xsyr2<float>(queue, nullptr); Xsyr2<double>(queue, nullptr);
Xspr2<float>(queue, nullptr); Xspr2<double>(queue, nullptr);
// Runs all the level 3 set-up functions
Xgemm<float>(queue, nullptr).SetUp(); Xgemm<double>(queue, nullptr).SetUp(); Xgemm<float2>(queue, nullptr).SetUp(); Xgemm<double2>(queue, nullptr).SetUp();
Xsymm<float>(queue, nullptr).SetUp(); Xsymm<double>(queue, nullptr).SetUp(); Xsymm<float2>(queue, nullptr).SetUp(); Xsymm<double2>(queue, nullptr).SetUp();
Xhemm<float2>(queue, nullptr).SetUp(); Xhemm<double2>(queue, nullptr).SetUp();
Xsyrk<float>(queue, nullptr).SetUp(); Xsyrk<double>(queue, nullptr).SetUp(); Xsyrk<float2>(queue, nullptr).SetUp(); Xsyrk<double2>(queue, nullptr).SetUp();
Xherk<float2,float>(queue, nullptr).SetUp(); Xherk<double2,double>(queue, nullptr).SetUp();
Xsyr2k<float>(queue, nullptr).SetUp(); Xsyr2k<double>(queue, nullptr).SetUp(); Xsyr2k<float2>(queue, nullptr).SetUp(); Xsyr2k<double2>(queue, nullptr).SetUp();
Xher2k<float2,float>(queue, nullptr).SetUp(); Xher2k<double2,double>(queue, nullptr).SetUp();
Xtrmm<float>(queue, nullptr).SetUp(); Xtrmm<double>(queue, nullptr).SetUp(); Xtrmm<float2>(queue, nullptr).SetUp(); Xtrmm<double2>(queue, nullptr).SetUp();
Xgemm<float>(queue, nullptr); Xgemm<double>(queue, nullptr); Xgemm<float2>(queue, nullptr); Xgemm<double2>(queue, nullptr);
Xsymm<float>(queue, nullptr); Xsymm<double>(queue, nullptr); Xsymm<float2>(queue, nullptr); Xsymm<double2>(queue, nullptr);
Xhemm<float2>(queue, nullptr); Xhemm<double2>(queue, nullptr);
Xsyrk<float>(queue, nullptr); Xsyrk<double>(queue, nullptr); Xsyrk<float2>(queue, nullptr); Xsyrk<double2>(queue, nullptr);
Xherk<float2,float>(queue, nullptr); Xherk<double2,double>(queue, nullptr);
Xsyr2k<float>(queue, nullptr); Xsyr2k<double>(queue, nullptr); Xsyr2k<float2>(queue, nullptr); Xsyr2k<double2>(queue, nullptr);
Xher2k<float2,float>(queue, nullptr); Xher2k<double2,double>(queue, nullptr);
Xtrmm<float>(queue, nullptr); Xtrmm<double>(queue, nullptr); Xtrmm<float2>(queue, nullptr); Xtrmm<double2>(queue, nullptr);
// Runs all the level 3 set-up functions
Xomatcopy<float>(queue, nullptr).SetUp(); Xomatcopy<double>(queue, nullptr).SetUp(); Xomatcopy<float2>(queue, nullptr).SetUp(); Xomatcopy<double2>(queue, nullptr).SetUp();
Xomatcopy<float>(queue, nullptr); Xomatcopy<double>(queue, nullptr); Xomatcopy<float2>(queue, nullptr); Xomatcopy<double2>(queue, nullptr);
} catch (...) { return DispatchException(); }
return StatusCode::kSuccess;

View File

@ -21,10 +21,11 @@
namespace clblast {
// =================================================================================================
// Constructor: not much here, because no status codes can be returned
// The constructor does all heavy work, errors are returned as exceptions
Routine::Routine(Queue &queue, EventPointer event, const std::string &name,
const std::vector<std::string> &routines, const Precision precision,
const std::vector<const Database::DatabaseEntry*> &userDatabase):
const std::vector<const Database::DatabaseEntry*> &userDatabase,
std::initializer_list<const char *> source):
precision_(precision),
routine_name_(name),
queue_(queue),
@ -33,12 +34,6 @@ Routine::Routine(Queue &queue, EventPointer event, const std::string &name,
device_(queue_.GetDevice()),
device_name_(device_.Name()),
db_(queue_, routines, precision_, userDatabase) {
}
// =================================================================================================
// Separate set-up function to allow for status codes to be returned
void Routine::SetUp() {
// Queries the cache to see whether or not the program (context-specific) is already there
if (ProgramIsInCache(context_, precision_, routine_name_)) { return; }
@ -77,37 +72,39 @@ void Routine::SetUp() {
}
}
// Loads the common header (typedefs and defines and such)
std::string common_header =
#include "kernels/common.opencl"
;
// Collects the parameters for this device in the form of defines, and adds the precision
auto defines = db_.GetDefines();
defines += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";
auto source_string = db_.GetDefines();
source_string += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";
// Adds the name of the routine as a define
defines += "#define ROUTINE_"+routine_name_+"\n";
source_string += "#define ROUTINE_"+routine_name_+"\n";
// For specific devices, use the non-IEEE 754 compliant OpenCL mad() instruction. This can improve
// performance, but might result in a reduced accuracy.
if (device_.IsAMD() && device_.IsGPU()) {
defines += "#define USE_CL_MAD 1\n";
source_string += "#define USE_CL_MAD 1\n";
}
// For specific devices, use staggered/shuffled workgroup indices.
if (device_.IsAMD() && device_.IsGPU()) {
defines += "#define USE_STAGGERED_INDICES 1\n";
source_string += "#define USE_STAGGERED_INDICES 1\n";
}
// For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
// performance through better cache behaviour
if (device_.IsARM() && device_.IsGPU()) {
defines += "#define GLOBAL_MEM_FENCE 1\n";
source_string += "#define GLOBAL_MEM_FENCE 1\n";
}
// Combines everything together into a single source string
const auto source_string = defines + common_header + source_string_;
// Loads the common header (typedefs and defines and such)
source_string +=
#include "kernels/common.opencl"
;
// Adds routine-specific code to the constructed source string
for (const char *s: source) {
source_string += s;
}
// Prints details of the routine to compile in case of debugging in verbose mode
#ifdef VERBOSE

View File

@ -34,21 +34,19 @@ class Routine {
// Base class constructor. The user database is an optional extra database to override the
// built-in database.
// All heavy preparation work is done inside this constructor.
explicit Routine(Queue &queue, EventPointer event, const std::string &name,
const std::vector<std::string> &routines, const Precision precision,
const std::vector<const Database::DatabaseEntry*> &userDatabase = {});
// Set-up phase of the kernel
void SetUp();
const std::vector<const Database::DatabaseEntry*> &userDatabase,
std::initializer_list<const char *> source);
protected:
// Non-static variable for the precision
const Precision precision_;
// The routine's name and its kernel-source in string form
// The routine's name
const std::string routine_name_;
std::string source_string_;
// The OpenCL objects, accessible only from derived classes
Queue queue_;

View File

@ -22,10 +22,9 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xamax<T>::Xamax(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/xamax.opencl"
;
}) {
}
// =================================================================================================

View File

@ -22,10 +22,9 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xasum<T>::Xasum(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/xasum.opencl"
;
}) {
}
// =================================================================================================

View File

@ -22,11 +22,10 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xaxpy<T>::Xaxpy(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/level1.opencl"
#include "../../kernels/level1/xaxpy.opencl"
;
}) {
}
// =================================================================================================

View File

@ -22,11 +22,10 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xcopy<T>::Xcopy(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/level1.opencl"
#include "../../kernels/level1/xcopy.opencl"
;
}) {
}
// =================================================================================================

View File

@ -22,10 +22,9 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xdot<T>::Xdot(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/xdot.opencl"
;
}) {
}
// =================================================================================================

View File

@ -22,10 +22,9 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xnrm2<T>::Xnrm2(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/xnrm2.opencl"
;
}) {
}
// =================================================================================================

View File

@ -22,11 +22,10 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xscal<T>::Xscal(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/level1.opencl"
#include "../../kernels/level1/xscal.opencl"
;
}) {
}
// =================================================================================================

View File

@ -22,11 +22,10 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xswap<T>::Xswap(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/level1.opencl"
#include "../../kernels/level1/xswap.opencl"
;
}) {
}
// =================================================================================================

View File

@ -22,11 +22,10 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xgemv<T>::Xgemv(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Pad", "Xgemv", "XgemvFast", "XgemvFastRot"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Pad", "Xgemv", "XgemvFast", "XgemvFastRot"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level2/xgemv.opencl"
#include "../../kernels/level2/xgemv_fast.opencl"
;
}) {
}
// =================================================================================================

View File

@ -22,11 +22,10 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xger<T>::Xger(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xger"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Xger"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level2/level2.opencl"
#include "../../kernels/level2/xger.opencl"
;
}) {
}
// =================================================================================================

View File

@ -21,11 +21,10 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T, typename U>
Xher<T,U>::Xher(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xger"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Xger"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level2/level2.opencl"
#include "../../kernels/level2/xher.opencl"
;
}) {
}
// =================================================================================================

View File

@ -21,11 +21,10 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xher2<T>::Xher2(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xger"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Xger"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level2/level2.opencl"
#include "../../kernels/level2/xher2.opencl"
;
}) {
}
// =================================================================================================

View File

@ -24,8 +24,7 @@ template <typename T>
// Constructor: forwards queue, event and name to the Routine base-class constructor, together with
// a seven-entry name list, PrecisionValue<T>() and the OpenCL sources pulled in by the #include
// lines below.
// The sources are kept in two separate parts (see the C1091 remark further down): the older form
// stores the first part in source_string_ and appends a local source_string_part_2 to it, while the
// newer form passes both parts to Routine as two elements of a trailing braced list.
// NOTE(review): this excerpt interleaves both forms, and the '@ ... @' line in the middle marks
// includes that are not shown here. The meaning of the empty '{}' argument is not visible either --
// confirm against the Routine declaration.
Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name,
{"Copy","Pad","Transpose","Padtranspose","Xgemm","XgemmDirect","KernelSelection"},
PrecisionValue<T>()) {
source_string_ =
PrecisionValue<T>(), {}, {
#include "../../kernels/level3/level3.opencl"
#include "../../kernels/level3/copy_fast.opencl"
#include "../../kernels/level3/copy_pad.opencl"
@ -37,13 +36,11 @@ Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
#include "../../kernels/level3/xgemm_direct_part1.opencl"
#include "../../kernels/level3/xgemm_direct_part2.opencl"
#include "../../kernels/level3/xgemm_direct_part3.opencl"
;
auto source_string_part_2 = // separated in two parts to prevent C1091 in MSVC 2013
, // separated in two parts to prevent C1091 in MSVC 2013
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
#include "../../kernels/level3/xgemm_part3.opencl"
;
source_string_ += source_string_part_2;
}) {
}
// =================================================================================================

View File

@ -22,8 +22,7 @@ namespace clblast {
// Constructor: forwards queue, event and name to the Routine base-class constructor, together with
// the name list {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>() and the
// OpenCL sources pulled in by the #include lines below (level3 helpers and the xgemm parts).
// NOTE(review): this excerpt interleaves two forms of the same constructor. The older form assigns
// the included sources to source_string_ inside the body; the newer form hands them to Routine as a
// trailing braced list. The '@ ... @' line in the middle marks includes that are not shown here,
// and the meaning of the empty '{}' argument is not visible -- confirm against the Routine
// declaration.
template <typename T, typename U>
Xher2k<T,U>::Xher2k(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level3/level3.opencl"
#include "../../kernels/level3/copy_fast.opencl"
#include "../../kernels/level3/copy_pad.opencl"
@ -32,7 +31,7 @@ Xher2k<T,U>::Xher2k(Queue &queue, EventPointer event, const std::string &name):
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
#include "../../kernels/level3/xgemm_part3.opencl"
;
}) {
}
// =================================================================================================

View File

@ -22,8 +22,7 @@ namespace clblast {
// Constructor: forwards queue, event and name to the Routine base-class constructor, together with
// the name list {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>() and the
// OpenCL sources pulled in by the #include lines below (level3 helpers and the xgemm parts).
// NOTE(review): this excerpt interleaves two forms of the same constructor. The older form assigns
// the included sources to source_string_ inside the body; the newer form hands them to Routine as a
// trailing braced list. The '@ ... @' line in the middle marks includes that are not shown here,
// and the meaning of the empty '{}' argument is not visible -- confirm against the Routine
// declaration.
template <typename T, typename U>
Xherk<T,U>::Xherk(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level3/level3.opencl"
#include "../../kernels/level3/copy_fast.opencl"
#include "../../kernels/level3/copy_pad.opencl"
@ -32,7 +31,7 @@ Xherk<T,U>::Xherk(Queue &queue, EventPointer event, const std::string &name):
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
#include "../../kernels/level3/xgemm_part3.opencl"
;
}) {
}
// =================================================================================================

View File

@ -22,8 +22,7 @@ namespace clblast {
// Constructor: forwards queue, event and name to the Routine base-class constructor, together with
// the name list {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>() and the
// OpenCL sources pulled in by the #include lines below (level3 helpers and the xgemm parts).
// NOTE(review): this excerpt interleaves two forms of the same constructor. The older form assigns
// the included sources to source_string_ inside the body; the newer form hands them to Routine as a
// trailing braced list. The '@ ... @' line in the middle marks includes that are not shown here,
// and the meaning of the empty '{}' argument is not visible -- confirm against the Routine
// declaration.
template <typename T>
Xsyr2k<T>::Xsyr2k(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level3/level3.opencl"
#include "../../kernels/level3/copy_fast.opencl"
#include "../../kernels/level3/copy_pad.opencl"
@ -32,7 +31,7 @@ Xsyr2k<T>::Xsyr2k(Queue &queue, EventPointer event, const std::string &name):
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
#include "../../kernels/level3/xgemm_part3.opencl"
;
}) {
}
// =================================================================================================

View File

@ -22,8 +22,7 @@ namespace clblast {
// Constructor: forwards queue, event and name to the Routine base-class constructor, together with
// the name list {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>() and the
// OpenCL sources pulled in by the #include lines below (level3 helpers and the xgemm parts).
// NOTE(review): this excerpt interleaves two forms of the same constructor. The older form assigns
// the included sources to source_string_ inside the body; the newer form hands them to Routine as a
// trailing braced list. The '@ ... @' line in the middle marks includes that are not shown here,
// and the meaning of the empty '{}' argument is not visible -- confirm against the Routine
// declaration.
template <typename T>
Xsyrk<T>::Xsyrk(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level3/level3.opencl"
#include "../../kernels/level3/copy_fast.opencl"
#include "../../kernels/level3/copy_pad.opencl"
@ -32,7 +31,7 @@ Xsyrk<T>::Xsyrk(Queue &queue, EventPointer event, const std::string &name):
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
#include "../../kernels/level3/xgemm_part3.opencl"
;
}) {
}
// =================================================================================================

View File

@ -22,14 +22,13 @@ namespace clblast {
// Constructor: forwards queue, event and name to the Routine base-class constructor, together with
// the name list {"Copy","Pad","Transpose","Padtranspose"}, PrecisionValue<T>() and the OpenCL
// sources pulled in by the #include lines below (level3.opencl plus the copy and transpose kernels).
// NOTE(review): this excerpt interleaves two forms of the same constructor. The older form assigns
// the included sources to source_string_ inside the body; the newer form hands them to Routine as a
// trailing braced list. The meaning of the empty '{}' argument is not visible here -- confirm
// against the Routine declaration.
template <typename T>
Xomatcopy<T>::Xomatcopy(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level3/level3.opencl"
#include "../../kernels/level3/copy_fast.opencl"
#include "../../kernels/level3/copy_pad.opencl"
#include "../../kernels/level3/transpose_fast.opencl"
#include "../../kernels/level3/transpose_pad.opencl"
;
}) {
}
// =================================================================================================