The following code snippet is an example of how to implement a wide multiplication using the minimum number of DSPs for the best timing performance.
It would take the place of a simple A = B*C multiplication.
// Uncomment to do mult with a single '*' operator
// Uncomment this and run 'csim' to see that, without casting the upper
// partial product before the shift, the design does not work correctly: the
// expression 'tmp_a * y' has a width of
// (A_TOTAL_WIDTH + (B_TOTAL_WIDTH - 17)) = 42 bits, which is not widened to
// the necessary 59 bits before the left-shift (LSH) operation (due to C
// semantics). This behavior is documented on pp. 334-335 of UG902.
// Computes out_p = in_a * in_b.
//
// By default the multiplication is decomposed by splitting in_b into its low
// 17 bits (x, unsigned) and its remaining upper bits (y, signed), so that
//     in_a * in_b == in_a * x + ((in_a * y) << 17).
//
// Compile-time switches:
//   USE_SINGLE_MULT_OP - when defined, the product is computed with a single
//                        '*' operator instead of the split form.
//                        NOTE(review): this macro name is reconstructed; the
//                        original guard was not visible - confirm it against
//                        the project's sources/build flags.
//   NOCAST_FAULT       - when defined, the widening cast on the upper partial
//                        product is omitted to demonstrate the resulting
//                        loss of MSBs during the shift (for 'csim' only).
//
// Parameters:
//   in_a  - first operand; its A_TOTAL_WIDTH bits are reinterpreted as a
//           signed integer.
//   in_b  - second operand; B_TOTAL_WIDTH bits wide.
//   out_p - receives the product (P_TOTAL_WIDTH bits).
void wide_mult(in_a_t in_a, in_b_t in_b, out_p_t& out_p)
{
#ifndef USE_SINGLE_MULT_OP
    // Split in_b into a 17-bit unsigned low part and a signed high part.
    ap_uint<17> x = in_b.range(16, 0);
    ap_int<B_TOTAL_WIDTH - 17> y = in_b.range(B_TOTAL_WIDTH - 1, 17); // >> 17;

    // effectively cast bits of in_a to an integer - does not affect ap_int<> case
    ap_int<A_TOTAL_WIDTH> tmp_a = in_a.range(A_TOTAL_WIDTH - 1, 0);

#ifndef NOCAST_FAULT
    // The cast below is necessary to ensure MSBs are not lost during the
    // shift: 'tmp_a * y' is only as wide as the sum of its operand widths,
    // so it must be widened to P_TOTAL_WIDTH before being shifted left.
    ap_int<P_TOTAL_WIDTH> tmp_p = tmp_a * x + (ap_int<P_TOTAL_WIDTH>(tmp_a * y) << 17);
#else
    // Deliberately faulty variant: the upper partial product is shifted at
    // its natural width, so its top bits are dropped.
    ap_int<P_TOTAL_WIDTH> tmp_p = tmp_a * x + (/*ap_int<P_TOTAL_WIDTH>*/(tmp_a * y) << 17);
#endif

    // Effectively casts the integer product back into fixed point
    // n.b. For ap_fixed<> I/O for this simple case the widths (W & IW; total and
    // integer) were chosen to be the natural product width, i.e. the sum of the
    // operand widths. For cases where the product widths are not simply the sum
    // of the operand widths, scaling (shifting) of tmp_p before assigning to
    // out_p.range() may be necessary.
    out_p.range(P_TOTAL_WIDTH - 1, 0) = tmp_p;
#else
    // Reference implementation: a single '*' operator.
    out_p = in_a * in_b;
#endif
}