// Deep Neural Network Library (DNNL) 1.91.0
// Performance library for Deep Learning
// performance_profiling.cpp
//
// Annotated version: Performance Profiling Example
/*******************************************************************************
* Copyright 2019 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include <chrono>
#include <iostream>
#include <stdexcept>
#include <vector>
#include "dnnl.hpp"
#include "example_utils.hpp"
using namespace dnnl;
// [Prologue]
// Set Strides and Padding
const memory::dims strides = {4, 4};
const memory::dims padding = {0, 0};
// [Prologue]
//
// function to init data
// Fill a DNNL memory object with a constant value.
//
// m: memory object to initialize; its contents are interpreted as f32
// v: the value written into every element
void init_data(memory &m, float v) {
    // number of f32 elements the memory descriptor covers
    size_t size = m.get_desc().get_size() / sizeof(float);
    // Build the constant pattern on the host, then copy it into the DNNL
    // memory object (which may live on a non-CPU engine). The previous
    // version filled only the local vector and never wrote it back, so the
    // initialization had no effect on `m`.
    std::vector<float> data(size, v);
    write_to_dnnl_memory(data.data(), m);
}
// function to execute non-fused relu
// Create and execute a standalone (non-fused) eltwise ReLU primitive.
// Operates in-place on `data`, in whatever memory format `data` carries.
void create_and_execute_relu(memory &data, engine &eng, stream &s) {
    // relu operates on whatever data format is given to it
    // create an operation descriptor for forward-inference ReLU
    // (alpha = 0 -> standard ReLU; beta is unused by eltwise_relu)
    auto relu_d = eltwise_forward::desc(prop_kind::forward_inference,
            algorithm::eltwise_relu, data.get_desc(), 0.f, 0.f);
    auto relu_pd = eltwise_forward::primitive_desc(relu_d, eng);
    auto relu = eltwise_forward(relu_pd);
    // execute it (in-place: SRC and DST map to the same memory)
    relu.execute(s, {{DNNL_ARG_SRC, data}, {DNNL_ARG_DST, data}});
}
// [Create post_op attr with relu]
// function to create post-op attribute for fused relu
// Build a primitive attribute that fuses a ReLU as a post-op, so a
// convolution created with this attribute applies ReLU to its output
// without a separate primitive execution.
primitive_attr create_attr_with_relu_post_op() {
    // create a post-op chain containing a single ReLU
    // (scale = 1, alpha = 0 -> standard ReLU, beta unused)
    post_ops ops;
    ops.append_eltwise(1.f, algorithm::eltwise_relu, 0.f, 0.f);
    // create an attribute and set the corresponding post op
    primitive_attr attr;
    attr.set_post_ops(ops);
    return attr;
}
// [Create post_op attr with relu]
// Implementation for naive convolution on nchw (data) and oihw (weights),
// followed by execution of non-fused relu
// Implementation 1 ("naive"): run the convolution directly on the user's
// nchw (data) / oihw (weights) formats, then execute a separate,
// non-fused ReLU on the destination.
void conv_relu_naive(memory user_src, memory user_wei, memory user_dst,
        engine &eng, stream &s) {
    // [Create mem_desc]
    // copy the dimensions and format from user's memory
    auto conv_src_md = memory::desc(user_src.get_desc());
    auto conv_wei_md = memory::desc(user_wei.get_desc());
    auto conv_dst_md = memory::desc(user_dst.get_desc());
    // [Create mem_desc]
    // [Create conv_desc]
    // create a convolution descriptor (inference, direct algorithm)
    auto conv_d = convolution_forward::desc(prop_kind::forward_inference,
            algorithm::convolution_direct, conv_src_md, conv_wei_md,
            conv_dst_md, strides, padding, padding);
    // [Create conv_desc]
    // [Create conv_prim_desc]
    // create a convolution primitive descriptor
    auto conv_pd = convolution_forward::primitive_desc(conv_d, eng);
    // [Create conv_prim_desc]
    // [Create conv_primitive]
    // create convolution primitive
    auto conv = convolution_forward(conv_pd);
    // [Create conv_primitive]
    // [Add to stream]
    // execute convolution by adding it to the stream s
    conv.execute(s,
            {{DNNL_ARG_SRC, user_src}, {DNNL_ARG_WEIGHTS, user_wei},
                    {DNNL_ARG_DST, user_dst}});
    // [Add to stream]
    // [Create and execute relu]
    // execute relu (on convolution's destination format, whatever it is)
    create_and_execute_relu(user_dst, eng, s);
    s.wait();
    // [Create and execute relu]
}
// Implementation for convolution on blocked format for data and
// weights, followed by execution of non-fused relu
// Implementation 2 ("blocked"): let the convolution choose its preferred
// (possibly blocked) formats via tag=any, reordering user data in and the
// result back out as needed; ReLU is still executed as a separate primitive.
void conv_relu_blocked(memory user_src, memory user_wei, memory user_dst,
        engine &eng, stream &s) {
    // [Create mem_desc with tag=any]
    // copy the dimensions and data type from user's memory
    auto conv_src_md = memory::desc(user_src.get_desc());
    auto conv_wei_md = memory::desc(user_wei.get_desc());
    auto conv_dst_md = memory::desc(user_dst.get_desc());
    // reset format to "any" to allow convolution to pick the best implementation
    conv_src_md.data.format_kind = dnnl_format_kind_any;
    conv_wei_md.data.format_kind = dnnl_format_kind_any;
    conv_dst_md.data.format_kind = dnnl_format_kind_any;
    // [Create mem_desc with tag=any]
    // [Create conv_desc implementation2]
    // create a convolution descriptor (inference, direct algorithm)
    auto conv_d = convolution_forward::desc(prop_kind::forward_inference,
            algorithm::convolution_direct, conv_src_md, conv_wei_md,
            conv_dst_md, strides, padding, padding);
    // [Create conv_desc implementation2]
    // [Create conv_prim_desc implementation2]
    // create a convolution primitive descriptor and primitive
    auto conv_pd = convolution_forward::primitive_desc(conv_d, eng);
    // [Create conv_prim_desc implementation2]
    // [Conditionally create and execute reorder prims]
    // prepare convolution source: reorder only if the chosen format differs
    memory conv_src = user_src;
    if (conv_pd.src_desc() != user_src.get_desc()) {
        conv_src = memory(conv_pd.src_desc(), eng);
        auto r_pd = reorder::primitive_desc(user_src, conv_src);
        reorder(r_pd).execute(s, user_src, conv_src);
    }
    // prepare convolution weights
    memory conv_wei = user_wei;
    if (conv_pd.weights_desc() != user_wei.get_desc()) {
        conv_wei = memory(conv_pd.weights_desc(), eng);
        auto r_pd = reorder::primitive_desc(user_wei, conv_wei);
        reorder(r_pd).execute(s, user_wei, conv_wei);
    }
    // prepare convolution destination (no data to copy yet, just storage)
    memory conv_dst = user_dst;
    if (conv_pd.dst_desc() != user_dst.get_desc())
        conv_dst = memory(conv_pd.dst_desc(), eng);
    // [Conditionally create and execute reorder prims]
    // [Create conv_primitive implementation2]
    // create convolution primitive
    auto conv = convolution_forward(conv_pd);
    // [Create conv_primitive implementation2]
    // [Add to stream implementation2]
    // execute convolution by adding it to the stream s
    conv.execute(s,
            {{DNNL_ARG_SRC, conv_src}, {DNNL_ARG_WEIGHTS, conv_wei},
                    {DNNL_ARG_DST, conv_dst}});
    // [Add to stream implementation2]
    // [Create and execute relu implementation2]
    // execute relu (on convolution's destination format, whatever it is)
    create_and_execute_relu(conv_dst, eng, s);
    // [Create and execute relu implementation2]
    // reorder the result back to the user's format if needed
    if (conv_pd.dst_desc() != user_dst.get_desc()) {
        auto r_pd = reorder::primitive_desc(conv_dst, user_dst);
        reorder(r_pd).execute(s, conv_dst, user_dst);
    }
    s.wait();
}
// Implementation for convolution on blocked format for data and
// weights and the relu operation fused via a post-op attribute added to the
// convolution prim_descriptor
void conv_relu_fused(memory user_src, memory user_wei, memory user_dst,
engine eng, stream s) {
// copy the dimensions and format from user's memory
auto conv_src_md = memory::desc(user_src.get_desc());
auto conv_wei_md = memory::desc(user_wei.get_desc());
auto conv_dst_md = memory::desc(user_dst.get_desc());
// reset format to any to allow convolution to pick the best implementation
// create a convolution descriptor
algorithm::convolution_direct, conv_src_md, conv_wei_md,
conv_dst_md, strides, padding, padding);
// Next the convolution prim descriptor is created, which inherits the ReLU
// [Create prim_desc with attr]
// create an attribute for fused relu
auto attr = create_attr_with_relu_post_op();
// create a convolution primitive descriptor
auto conv_pd = convolution_forward::primitive_desc(conv_d, attr, eng);
// [Create prim_desc with attr]
// prepare convolution source
memory conv_src = user_src;
if (conv_pd.src_desc() != user_src.get_desc()) {
conv_src = memory(conv_pd.src_desc(), eng);
auto r_pd = reorder::primitive_desc(user_src, conv_src);
reorder(r_pd).execute(s, user_src, conv_src);
}
// prepare convolution weights
memory conv_wei = user_wei;
if (conv_pd.weights_desc() != user_wei.get_desc()) {
conv_wei = memory(conv_pd.weights_desc(), eng);
auto r_pd = reorder::primitive_desc(user_wei, conv_wei);
reorder(r_pd).execute(s, user_wei, conv_wei);
}
// prepare convolution destination
memory conv_dst = user_dst;
if (conv_pd.dst_desc() != user_dst.get_desc())
conv_dst = memory(conv_pd.dst_desc(), eng);
// [Create conv_primitive implementation3]
// create convolution primitive
auto conv = convolution_forward(conv_pd);
// [Create conv_primitive implementation3]
// [Add to stream implementation3]
// execute convolution by adding it to the stream s
conv.execute(s,
{{DNNL_ARG_SRC, conv_src}, {DNNL_ARG_WEIGHTS, conv_wei},
{DNNL_ARG_DST, conv_dst}});
// [Add to stream implementation3]
// reorder data to user's format if needed
if (conv_pd.dst_desc() != user_dst.get_desc()) {
auto r_pd = reorder::primitive_desc(conv_dst, user_dst);
reorder(r_pd).execute(s, conv_dst, user_dst);
}
s.wait();
}
// Driver: builds synthetic tensors and runs the requested implementation(s).
//
// engine_kind: device to run on (cpu/gpu), parsed from the command line
// argc/argv:  forwarded to select the implementation via argv[2]
// Throws std::invalid_argument on an unrecognized implementation name.
void performance_profiling(engine::kind engine_kind, int argc, char **argv) {
    // Initialize engine (device index 0 of the requested kind)
    engine eng(engine_kind, 0);
    // Initialize stream
    stream s(eng);
    // [Set dimensions]
    // set dimensions for synthetic data and weights
    const memory::dim BATCH = 1000;
    const memory::dim IC = 3, OC = 96;
    const memory::dim IH = 227, KH = 11, OH = 55;
    const memory::dim IW = 227, KW = 11, OW = 55;
    // [Set dimensions]
    // [Create memory objects]
    // create DNNL memory objects for user's tensors (in nchw and oihw formats)
    // @note here the library allocates memory
    auto user_src = memory({{BATCH, IC, IH, IW}, memory::data_type::f32,
                                   memory::format_tag::nchw},
            eng);
    auto user_wei = memory({{OC, IC, KH, KW}, memory::data_type::f32,
                                   memory::format_tag::oihw},
            eng);
    auto user_dst = memory({{BATCH, OC, OH, OW}, memory::data_type::f32,
                                   memory::format_tag::nchw},
            eng);
    // [Create memory objects]
    // fill source, destination, and weights with synthetic data
    init_data(user_src, 1);
    init_data(user_dst, -1);
    init_data(user_wei, .5);
    // set implementation ("naive"||"blocked"||"fused") setting implementation
    // to "validation" will run all implementations
    std::string implementation;
    if (argc <= 2)
        implementation = "validation";
    else if (argc == 3)
        implementation = argv[2];
    if (!(implementation == "validation" || implementation == "naive"
                || implementation == "blocked" || implementation == "fused")) {
        std::cout << "The implementation can be one of:\n";
        std::cout << " - naive: NCHW format without fusion\n";
        std::cout << " - blocked: format propagation without fusion\n";
        std::cout << " - fused: format propagation with fusion\n";
        std::cout << " - validation: runs all implementations\n\n";
        std::cout << "Validation will run if no parameters are specified.\n\n";
        throw std::invalid_argument("Incorrect input arguments.");
    }
    if (implementation == "naive" || implementation == "validation") {
        std::cout << "Implementation: naive.\n";
        // run conv + relu w/o fusing
        conv_relu_naive(user_src, user_wei, user_dst, eng, s);
        std::cout << "Conv + ReLU w/ nchw format completed.\n";
    }
    if (implementation == "blocked" || implementation == "validation") {
        std::cout << "Implementation: blocked.\n";
        // run conv + relu w/o fusing
        conv_relu_blocked(user_src, user_wei, user_dst, eng, s);
        std::cout << "Conv + ReLU w/ blocked format completed.\n";
    }
    if (implementation == "fused" || implementation == "validation") {
        std::cout << "Implementation: fused.\n";
        // run conv + relu w/ fusing
        conv_relu_fused(user_src, user_wei, user_dst, eng, s);
        std::cout << "Conv + ReLU w/ fusing completed.\n";
    }
}
int main(int argc, char **argv) {
engine::kind engine_kind = parse_engine_kind(argc, argv, 1);
return handle_example_errors(
performance_profiling, engine_kind, argc, argv);
}
/*
 * Doxygen symbol cross-reference (annotated-page residue, not code):
 *
 * dnnl::reorder::execute
 * void execute(stream stream, memory &src, memory &dst) const
 * Executes the reorder primitive.
 * Definition: dnnl.hpp:2907
 * dnnl::memory::desc::get_size
 * size_t get_size() const
 * Returns size of the memory descriptor in bytes.
 * Definition: dnnl.hpp:2210
 * dnnl::reorder
 * Reorder primitive.
 * Definition: dnnl.hpp:2798
 * dnnl::stream
 * An execution stream.
 * Definition: dnnl.hpp:1433
 * dnnl::engine
 * An execution engine.
 * Definition: dnnl.hpp:1273
 * dnnl::memory::desc::data
 * dnnl_memory_desc_t data
 * The underlying C API data structure.
 * Definition: dnnl.hpp:2108
 * dnnl::stream::wait
 * stream & wait()
 * Waits for all primitives executing in the stream to finish.
 * Definition: dnnl.hpp:1501
 * dnnl::eltwise_forward
 * Elementwise unary operation forward propagation primitive.
 * Definition: dnnl.hpp:5173
 * dnnl::memory::data_type::f32
 * 32-bit/single-precision floating point.
 * dnnl::eltwise_forward::desc
 * Descriptor for an elementwise forward propagation primitive.
 * Definition: dnnl.hpp:5175
 * dnnl::engine::kind
 * kind
 * Kinds of engines.
 * Definition: dnnl.hpp:1278
 * DNNL_ARG_DST
 * #define DNNL_ARG_DST
 * A special mnemonic for destination argument for primitives that have a single destination.
 * Definition: dnnl_types.h:1713
 * dnnl::memory::format_tag::oihw
 * 4D CNN weights tensor; an alias for dnnl::memory::format_tag::abcd
 * dnnl::primitive_attr::set_post_ops
 * void set_post_ops(const post_ops ops)
 * Sets post-ops.
 * Definition: dnnl.hpp:1171
 * dnnl::eltwise_forward::primitive_desc
 * Primitive descriptor for an elementwise forward propagation primitive.
 * Definition: dnnl.hpp:5209
 * dnnl::post_ops
 * Post-ops.
 * Definition: dnnl.hpp:833
 * dnnl::convolution_forward
 * Convolution forward propagation primitive.
 * Definition: dnnl.hpp:3233
 * dnnl.hpp
 * dnnl::prop_kind::forward_inference
 * Forward data propagation (inference mode).
 * dnnl::memory::get_desc
 * desc get_desc() const
 * Returns the associated memory descriptor.
 * Definition: dnnl.hpp:2292
 * dnnl::algorithm::convolution_direct
 * Direct convolution.
 * dnnl::convolution_forward::primitive_desc
 * Primitive descriptor for a convolution forward propagation primitive.
 * Definition: dnnl.hpp:3439
 * DNNL_ARG_SRC
 * #define DNNL_ARG_SRC
 * A special mnemonic for source argument for primitives that have a single source.
 * Definition: dnnl_types.h:1689
 * dnnl_memory_desc_t::format_kind
 * dnnl_format_kind_t format_kind
 * Memory format kind.
 * Definition: dnnl_types.h:1010
 * dnnl::reorder::primitive_desc
 * Primitive descriptor for a reorder primitive.
 * Definition: dnnl.hpp:2800
 * dnnl::post_ops::append_eltwise
 * void append_eltwise(float scale, algorithm algorithm, float alpha, float beta)
 * Appends an elementwise post-op.
 * Definition: dnnl.hpp:908
 * dnnl::memory::dim
 * dnnl_dim_t dim
 * Integer type for representing dimension sizes and indices.
 * Definition: dnnl.hpp:1608
 * dnnl::algorithm::eltwise_relu
 * Elementwise: ReLU.
 * dnnl::primitive_attr
 * Primitive attributes.
 * Definition: dnnl.hpp:942
 * dnnl::memory
 * Memory object.
 * Definition: dnnl.hpp:1606
 * dnnl_format_kind_any
 * Unspecified format kind.
 * Definition: dnnl_types.h:84
 * dnnl::memory::dims
 * std::vector< dim > dims
 * Vector of dimensions.
 * Definition: dnnl.hpp:1611
 * dnnl::memory::desc
 * A memory descriptor.
 * Definition: dnnl.hpp:2105
 * dnnl::convolution_forward::desc
 * Descriptor for a convolution forward propagation primitive.
 * Definition: dnnl.hpp:3235
 * dnnl
 * DNNL namespace.
 * Definition: dnnl.hpp:81
 * dnnl::memory::format_tag::nchw
 * 4D CNN activations tensor; an alias for dnnl::memory::format_tag::abcd
 * DNNL_ARG_WEIGHTS
 * #define DNNL_ARG_WEIGHTS
 * A special mnemonic for primitives that have a single weights argument.
 * Definition: dnnl_types.h:1736
 */