//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
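//
// For example (an illustrative sketch, not actual pass output), with a
// vector width of 4 the scalar loop
//   for (i = 0; i < n; i += 1) A[i] = B[i] + 1;
// becomes, conceptually, a wide loop plus a scalar epilogue:
//   for (i = 0; i + 4 <= n; i += 4) A[i:i+3] = B[i:i+3] + 1; // vector body
//   for (; i < n; i += 1)           A[i] = B[i] + 1;         // epilogue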
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
// of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
// widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
// of vectorization. It decides on the optimal vector width, which
// can be one, if vectorization is not profitable.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
// Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// Other ideas/concepts are from:
// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
// Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/VectorUtils.h"
#include <algorithm>
#include <map>
#include <tuple>

using namespace llvm;
using namespace llvm::PatternMatch;
#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
static cl::opt<unsigned>
VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
                    cl::desc("Sets the SIMD width. Zero is autoselect."));

static cl::opt<unsigned>
VectorizationInterleave("force-vector-interleave", cl::init(0), cl::Hidden,
                        cl::desc("Sets the vectorization interleave count. "
                                 "Zero is autoselect."));

static cl::opt<bool>
EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
                   cl::desc("Enable if-conversion during vectorization."));
/// We don't vectorize loops with a known constant trip count below this number.
static cl::opt<unsigned>
TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16),
                             cl::Hidden,
                             cl::desc("Don't vectorize loops with a constant "
                                      "trip count that is smaller than this "
                                      "value."));
/// This enables versioning on the strides of symbolically striding memory
/// accesses in code like the following.
///   for (i = 0; i < N; ++i)
///     A[i * Stride1] += B[i * Stride2] ...
///
/// Will be roughly translated to
///    if (Stride1 == 1 && Stride2 == 1) {
///      for (i = 0; i < N; i+=4)
///       A[i:i+3] += ...
///    } else
///      ...
static cl::opt<bool> EnableMemAccessVersioning(
    "enable-mem-access-versioning", cl::init(true), cl::Hidden,
    cl::desc("Enable symbolic stride memory access versioning"));
/// We don't unroll loops with a known constant trip count below this number.
static const unsigned TinyTripCountUnrollThreshold = 128;

/// When performing memory disambiguation checks at runtime do not make more
/// than this number of comparisons.
static const unsigned RuntimeMemoryCheckThreshold = 8;
/// Maximum SIMD width.
static const unsigned MaxVectorWidth = 64;
static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));
/// Maximum vectorization interleave count.
static const unsigned MaxInterleaveFactor = 16;
static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));
static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc("The cost of a loop that is considered 'small' by the unroller."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime unroll loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeUnroll(
    "enable-loadstore-runtime-unroll", cl::init(true), cl::Hidden,
    cl::desc("Enable runtime unrolling until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when unrolling"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(false), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));
static cl::opt<unsigned> MaxNestedScalarReductionUF(
    "max-nested-scalar-reduction-unroll", cl::init(2), cl::Hidden,
    cl::desc("The maximum unroll factor to use when unrolling a scalar "
             "reduction in a nested loop."));
namespace {

// Forward declarations.
class LoopVectorizationLegality;
class LoopVectorizationCostModel;
class LoopVectorizeHints;
/// Optimization analysis message produced during vectorization. Messages
/// inform the user why vectorization did not occur.
class Report {
  std::string Message;
  raw_string_ostream Out;
  Instruction *Instr;

public:
  Report(Instruction *I = nullptr) : Out(Message), Instr(I) {
    Out << "loop not vectorized: ";
  }

  template <typename A> Report &operator<<(const A &Value) {
    Out << Value;
    return *this;
  }

  Instruction *getInstr() { return Instr; }

  std::string &str() { return Out.str(); }
  operator Twine() { return Out.str(); }
};
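
// A minimal usage sketch of Report (illustrative only; 'BadInst' is a
// hypothetical instruction pointer):
//   Report R(BadInst);
//   R << "unsupported operation: " << *BadInst;
//   // R.str() now reads "loop not vectorized: unsupported operation: ..."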
/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
                      DominatorTree *DT, const DataLayout *DL,
                      const TargetLibraryInfo *TLI, unsigned VecWidth,
                      unsigned UnrollFactor)
      : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), TLI(TLI),
        VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()),
        Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor),
        Legal(nullptr) {}

  // Perform the actual loop widening (vectorization).
  void vectorize(LoopVectorizationLegality *L) {
    Legal = L;
    // Create a new empty loop. Unlink the old loop and connect the new one.
    createEmptyLoop();
    // Widen each instruction in the old loop to a new one in the new loop.
    // Use the Legality module to find the induction and reduction variables.
    vectorizeLoop();
    // Register the new loop and update the analysis passes.
    updateAnalysis();
  }
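  // A minimal usage sketch (illustrative; assumes LVL already passed its
  // legality checks and VF/UF were chosen by the cost model):
  //   InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF, UF);
  //   LB.vectorize(&LVL);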
  virtual ~InnerLoopVectorizer() {}

protected:
  /// A small list of PHINodes.
  typedef SmallVector<PHINode *, 4> PhiVector;
  /// When we unroll loops we have multiple vector values for each scalar.
  /// This data structure holds the unrolled and vectorized values that
  /// originated from one scalar instruction.
  typedef SmallVector<Value *, 2> VectorParts;

  // When we if-convert we need to create edge masks. We have to cache values
  // so that we don't end up with exponential recursion/IR.
  typedef DenseMap<std::pair<BasicBlock *, BasicBlock *>, VectorParts>
      EdgeMaskCache;
  /// \brief Add code that checks at runtime if the accessed arrays overlap.
  ///
  /// Returns a pair of instructions where the first element is the first
  /// instruction generated in a possibly long sequence of instructions and
  /// the second value is the final comparator value, or NULL if no check is
  /// needed.
  std::pair<Instruction *, Instruction *> addRuntimeCheck(Instruction *Loc);

  /// \brief Add checks for strides that were assumed to be 1.
  ///
  /// Returns the first and the last check instruction as the pair
  /// (first, last).
  std::pair<Instruction *, Instruction *> addStrideCheck(Instruction *Loc);
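
  // Conceptually (an illustrative sketch), the emitted stride guard tests the
  // symbolic stride at runtime and falls back to the scalar loop otherwise:
  //   if (Stride == 1) { /* vectorized loop */ } else { /* scalar loop */ }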
  /// Create an empty loop, based on the loop ranges of the old loop.
  void createEmptyLoop();
  /// Copy and widen the instructions from the old loop.
  virtual void vectorizeLoop();

  /// \brief The Loop exit block may have single value PHI nodes where the
  /// incoming value is 'Undef'. While vectorizing we only handled real values
  /// that were defined inside the loop. Here we fix the 'undef case'.
  /// See PR14725.
  void fixLCSSAPHIs();

  /// A helper function that computes the predicate of the block BB, assuming
  /// that the header block of the loop is set to True. It returns the *entry*
  /// mask for the block BB.
  VectorParts createBlockInMask(BasicBlock *BB);
  /// A helper function that computes the predicate of the edge between SRC
  /// and DST.
  VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);

  /// A helper function to vectorize a single BB within the innermost loop.
  void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, VectorParts &Entry,
                           unsigned UF, unsigned VF, PhiVector *PV);
  /// Insert the new loop to the loop hierarchy and pass manager
  /// and update the analysis passes.
  void updateAnalysis();

  /// This instruction is un-vectorizable. Implement it as a sequence
  /// of scalars. If \p IfPredicateStore is true we need to 'hide' each
  /// scalarized instruction behind an if block predicated on the control
  /// dependence of the instruction.
  virtual void scalarizeInstruction(Instruction *Instr,
                                    bool IfPredicateStore = false);

  /// Vectorize Load and Store instructions.
  virtual void vectorizeMemoryInstruction(Instruction *Instr);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds 0, 1, 2 ... to each vector element, starting at zero.
  /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...).
  /// The sequence starts at StartIdx.
  virtual Value *getConsecutiveVector(Value *Val, int StartIdx, bool Negate);
  /// When we go over instructions in the basic block we rely on previous
  /// values within the current basic block or on loop invariant values.
  /// When we widen (vectorize) values we place them in the map. If the values
  /// are not within the map, they have to be loop invariant, so we simply
  /// broadcast them into a vector.
  VectorParts &getVectorValue(Value *V);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// This is a helper class that holds the vectorizer state. It maps scalar
  /// instructions to vector instructions. When the code is 'unrolled' then
  /// a single scalar value is mapped to multiple vector parts. The parts
  /// are stored in the VectorPart type.
  struct ValueMap {
    /// C'tor. UnrollFactor controls the number of vectors ('parts') that
    /// are mapped.
    ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {}

    /// \return True if 'Key' is saved in the Value Map.
    bool has(Value *Key) const { return MapStorage.count(Key); }

    /// Initializes a new entry in the map. Sets all of the vector parts to the
    /// same value in 'Val'.
    /// \return A reference to a vector with splat values.
    VectorParts &splat(Value *Key, Value *Val) {
      VectorParts &Entry = MapStorage[Key];
      Entry.assign(UF, Val);
      return Entry;
    }

    ///\return A reference to the value that is stored at 'Key'.
    VectorParts &get(Value *Key) {
      VectorParts &Entry = MapStorage[Key];
      if (Entry.empty())
        Entry.resize(UF);
      assert(Entry.size() == UF);
      return Entry;
    }

  private:
    /// The unroll factor. Each entry in the map stores this number of vector
    /// elements.
    unsigned UF;

    /// Map storage. We use std::map and not DenseMap because insertions to a
    /// dense map invalidate its iterators.
    std::map<Value *, VectorParts> MapStorage;
  };
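
  // Usage sketch (illustrative only): with UF == 2, the two widened copies of
  // a scalar %x are cached and retrieved as:
  //   VectorParts &Parts = WidenMap.get(X);  // Parts.size() == 2
  //   Parts[0] = WideX0; Parts[1] = WideX1;  // one value per unrolled copy
  //   WidenMap.splat(C, BroadcastC);         // same broadcast in every part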
  /// The original loop.
  Loop *OrigLoop;
  /// Scev analysis to use.
  ScalarEvolution *SE;
  /// Loop Info.
  LoopInfo *LI;
  /// Dominator Tree.
  DominatorTree *DT;
  /// Alias Analysis.
  AliasAnalysis *AA;
  /// Data Layout.
  const DataLayout *DL;
  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  unsigned VF;

protected:
  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;
  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;
  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;
  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;
  /// The vector loop body.
  SmallVector<BasicBlock *, 4> LoopVectorBody;
  /// The scalar loop body.
  BasicBlock *LoopScalarBody;
  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction;
  /// The induction variable of the old basic block.
  PHINode *OldInduction;
  /// Holds the extended (to the widest induction type) start index.
  Value *ExtendedIdx;
  /// Maps scalars to widened vectors.
  ValueMap WidenMap;
  EdgeMaskCache MaskCache;

  LoopVectorizationLegality *Legal;
};
class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
                    DominatorTree *DT, const DataLayout *DL,
                    const TargetLibraryInfo *TLI, unsigned UnrollFactor)
      : InnerLoopVectorizer(OrigLoop, SE, LI, DT, DL, TLI, 1, UnrollFactor) {}

private:
  void scalarizeInstruction(Instruction *Instr,
                            bool IfPredicateStore = false) override;
  void vectorizeMemoryInstruction(Instruction *Instr) override;
  Value *getBroadcastInstrs(Value *V) override;
  Value *getConsecutiveVector(Value *Val, int StartIdx, bool Negate) override;
  Value *reverseVector(Value *Vec) override;
};
/// \brief Look for a meaningful debug location on the instruction or it's
/// operands.
static Instruction * getDebugLocFromInstOrOperands ( Instruction * I ) {
if ( ! I )
return I ;
DebugLoc Empty ;
if ( I - > getDebugLoc ( ) ! = Empty )
return I ;
for ( User : : op_iterator OI = I - > op_begin ( ) , OE = I - > op_end ( ) ; OI ! = OE ; + + OI ) {
if ( Instruction * OpInst = dyn_cast < Instruction > ( * OI ) )
if ( OpInst - > getDebugLoc ( ) ! = Empty )
return OpInst ;
}
return I ;
}
/// \brief Set the debug location in the builder using the debug location in the
/// instruction.
static void setDebugLocFromInst ( IRBuilder < > & B , const Value * Ptr ) {
if ( const Instruction * Inst = dyn_cast_or_null < Instruction > ( Ptr ) )
B . SetCurrentDebugLocation ( Inst - > getDebugLoc ( ) ) ;
else
B . SetCurrentDebugLocation ( DebugLoc ( ) ) ;
}
#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    const DebugLoc LoopDbgLoc = L->getStartLoc();
    if (!LoopDbgLoc.isUnknown())
      LoopDbgLoc.print(L->getHeader()->getContext(), OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif
/// \brief Propagate known metadata from one instruction to another.
static void propagateMetadata(Instruction *To, const Instruction *From) {
  SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
  From->getAllMetadataOtherThanDebugLoc(Metadata);

  for (auto M : Metadata) {
    unsigned Kind = M.first;

    // These are safe to transfer (this is safe for TBAA, even when we
    // if-convert, because should that metadata have had a control dependency
    // on the condition, and thus actually aliased with some other
    // non-speculated memory access when the condition was false, this would be
    // caught by the runtime overlap checks).
    if (Kind != LLVMContext::MD_tbaa &&
        Kind != LLVMContext::MD_alias_scope &&
        Kind != LLVMContext::MD_noalias &&
        Kind != LLVMContext::MD_fpmath)
      continue;

    To->setMetadata(Kind, M.second);
  }
}

/// \brief Propagate known metadata from one instruction to a vector of others.
static void propagateMetadata(SmallVectorImpl<Value *> &To,
                              const Instruction *From) {
  for (Value *V : To)
    if (Instruction *I = dyn_cast<Instruction>(V))
      propagateMetadata(I, From);
}
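
// For illustration: when a scalar load carrying a !tbaa tag is widened,
// calling propagateMetadata(WideLoad, ScalarLoad) copies that tag onto the
// wide load, so alias information survives vectorization.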
/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
/// to what vectorization factor.
/// This class does not look at the profitability of vectorization, only the
/// legality. This class has two main kinds of checks:
/// * Memory checks - The code in canVectorizeMemory checks if vectorization
///   will change the order of memory accesses in a way that will change the
///   correctness of the program.
/// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory
/// checks for a number of different conditions, such as the availability of a
/// single induction variable, that all types are supported and vectorize-able,
/// etc. This code reflects the capabilities of InnerLoopVectorizer.
/// This class is also used by InnerLoopVectorizer for identifying the
/// induction variable and the different reduction variables.
class LoopVectorizationLegality {
public:
  unsigned NumLoads;
  unsigned NumStores;
  unsigned NumPredStores;

  LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, const DataLayout *DL,
                            DominatorTree *DT, TargetLibraryInfo *TLI,
                            AliasAnalysis *AA, Function *F,
                            const TargetTransformInfo *TTI)
      : NumLoads(0), NumStores(0), NumPredStores(0), TheLoop(L), SE(SE), DL(DL),
        DT(DT), TLI(TLI), AA(AA), TheFunction(F), TTI(TTI), Induction(nullptr),
        WidestIndTy(nullptr), HasFunNoNaNAttr(false), MaxSafeDepDistBytes(-1U) {
  }
  /// This enum represents the kinds of reductions that we support.
  enum ReductionKind {
    RK_NoReduction,   ///< Not a reduction.
    RK_IntegerAdd,    ///< Sum of integers.
    RK_IntegerMult,   ///< Product of integers.
    RK_IntegerOr,     ///< Bitwise or logical OR of numbers.
    RK_IntegerAnd,    ///< Bitwise or logical AND of numbers.
    RK_IntegerXor,    ///< Bitwise or logical XOR of numbers.
    RK_IntegerMinMax, ///< Min/max implemented in terms of select(cmp()).
    RK_FloatAdd,      ///< Sum of floats.
    RK_FloatMult,     ///< Product of floats.
    RK_FloatMinMax    ///< Min/max implemented in terms of select(cmp()).
  };

  /// This enum represents the kinds of inductions that we support.
  enum InductionKind {
    IK_NoInduction,         ///< Not an induction variable.
    IK_IntInduction,        ///< Integer induction variable. Step = 1.
    IK_ReverseIntInduction, ///< Reverse int induction variable. Step = -1.
    IK_PtrInduction,        ///< Pointer induction var. Step = sizeof(elem).
    IK_ReversePtrInduction  ///< Reverse ptr indvar. Step = - sizeof(elem).
  };

  // This enum represents the kind of minmax reduction.
  enum MinMaxReductionKind {
    MRK_Invalid,
    MRK_UIntMin,
    MRK_UIntMax,
    MRK_SIntMin,
    MRK_SIntMax,
    MRK_FloatMin,
    MRK_FloatMax
  };
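
  // For illustration: in a loop like
  //   for (i = 0; i < n; ++i) Sum += A[i];
  // the PHI for Sum is a reduction with Kind == RK_IntegerAdd, while
  //   M = (A[i] > M) ? A[i] : M;  // select(icmp())
  // is recognized as RK_IntegerMinMax with MinMaxKind == MRK_SIntMax.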
  /// This struct holds information about reduction variables.
  struct ReductionDescriptor {
    ReductionDescriptor() : StartValue(nullptr), LoopExitInstr(nullptr),
                            Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {}

    ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K,
                        MinMaxReductionKind MK)
        : StartValue(Start), LoopExitInstr(Exit), Kind(K), MinMaxKind(MK) {}

    // The starting value of the reduction.
    // It does not have to be zero!
    TrackingVH<Value> StartValue;
    // The instruction whose value is used outside the loop.
    Instruction *LoopExitInstr;
    // The kind of the reduction.
    ReductionKind Kind;
    // If this is a min/max reduction the kind of reduction.
    MinMaxReductionKind MinMaxKind;
  };

  /// This POD struct holds information about a potential reduction operation.
  struct ReductionInstDesc {
    ReductionInstDesc(bool IsRedux, Instruction *I)
        : IsReduction(IsRedux), PatternLastInst(I), MinMaxKind(MRK_Invalid) {}

    ReductionInstDesc(Instruction *I, MinMaxReductionKind K)
        : IsReduction(true), PatternLastInst(I), MinMaxKind(K) {}

    // Is this instruction a reduction candidate.
    bool IsReduction;
    // The last instruction in a min/max pattern (select of the select(icmp())
    // pattern), or the current reduction instruction otherwise.
    Instruction *PatternLastInst;
    // If this is a min/max pattern the comparison predicate.
    MinMaxReductionKind MinMaxKind;
  };
  /// This struct holds information about the memory runtime legality
  /// check that a group of pointers do not overlap.
  struct RuntimePointerCheck {
    RuntimePointerCheck() : Need(false) {}

    /// Reset the state of the pointer runtime information.
    void reset() {
      Need = false;
      Pointers.clear();
      Starts.clear();
      Ends.clear();
      IsWritePtr.clear();
      DependencySetId.clear();
      AliasSetId.clear();
    }

    /// Insert a pointer and calculate the start and end SCEVs.
    void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr,
                unsigned DepSetId, unsigned ASId, ValueToValueMap &Strides);

    /// This flag indicates if we need to add the runtime check.
    bool Need;
    /// Holds the pointers that we need to check.
    SmallVector<TrackingVH<Value>, 2> Pointers;
    /// Holds the pointer value at the beginning of the loop.
    SmallVector<const SCEV *, 2> Starts;
    /// Holds the pointer value at the end of the loop.
    SmallVector<const SCEV *, 2> Ends;
    /// Holds the information if this pointer is used for writing to memory.
    SmallVector<bool, 2> IsWritePtr;
    /// Holds the id of the set of pointers that could be dependent because of
    /// a shared underlying object.
    SmallVector<unsigned, 2> DependencySetId;
    /// Holds the id of the disjoint alias set to which this pointer belongs.
    SmallVector<unsigned, 2> AliasSetId;
  };
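
  // Conceptually (an illustrative sketch), for two pointers the emitted
  // runtime check verifies that the accessed ranges are disjoint:
  //   bool NoOverlap = (EndA <= StartB) || (EndB <= StartA);
  //   if (NoOverlap) { /* vector loop */ } else { /* scalar loop */ }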
  /// A struct for saving information about induction variables.
  struct InductionInfo {
    InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {}
    InductionInfo() : StartValue(nullptr), IK(IK_NoInduction) {}
    /// Start value.
    TrackingVH<Value> StartValue;
    /// Induction kind.
    InductionKind IK;
  };

  /// ReductionList contains the reduction descriptors for all
  /// of the reductions that were found in the loop.
  typedef DenseMap<PHINode *, ReductionDescriptor> ReductionList;

  /// InductionList saves induction variables and maps them to the
  /// induction descriptor.
  typedef MapVector<PHINode *, InductionInfo> InductionList;
  /// Returns true if it is legal to vectorize this loop.
  /// This does not mean that it is profitable to vectorize this
  /// loop, only that it is legal to do so.
  bool canVectorize();

  /// Returns the Induction variable.
  PHINode *getInduction() { return Induction; }

  /// Returns the reduction variables found in the loop.
  ReductionList *getReductionVars() { return &Reductions; }

  /// Returns the induction variables found in the loop.
  InductionList *getInductionVars() { return &Inductions; }

  /// Returns the widest induction type.
  Type *getWidestInductionType() { return WidestIndTy; }

  /// Returns True if V is an induction variable in this loop.
  bool isInductionVariable(const Value *V);

  /// Return true if the block BB needs to be predicated in order for the loop
  /// to be vectorized.
  bool blockNeedsPredication(BasicBlock *BB);

  /// Check if this pointer is consecutive when vectorizing. This happens
  /// when the last index of the GEP is the induction variable, or when the
  /// pointer itself is an induction variable.
  /// This check allows us to vectorize A[idx] into a wide load/store.
  /// Returns:
  /// 0 - Stride is unknown or non-consecutive.
  /// 1 - Address is consecutive.
  /// -1 - Address is consecutive, and decreasing.
  int isConsecutivePtr(Value *Ptr);

  /// Returns true if the value V is uniform within the loop.
  bool isUniform(Value *V);

  /// Returns true if this instruction will remain scalar after vectorization.
  bool isUniformAfterVectorization(Instruction *I) { return Uniforms.count(I); }

  /// Returns the information that we collected about runtime memory check.
  RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; }

  /// This function returns the identity element (or neutral element) for
  /// the operation K.
  static Constant *getReductionIdentity(ReductionKind K, Type *Tp);

  unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }

  bool hasStride(Value *V) { return StrideSet.count(V); }
  bool mustCheckStrides() { return !StrideSet.empty(); }
  SmallPtrSet<Value *, 8>::iterator strides_begin() {
    return StrideSet.begin();
  }
  SmallPtrSet<Value *, 8>::iterator strides_end() { return StrideSet.end(); }

  /// Returns true if the target machine supports a masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
    return TTI->isLegalMaskedStore(DataType, isConsecutivePtr(Ptr));
  }
  /// Returns true if the target machine supports a masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
    return TTI->isLegalMaskedLoad(DataType, isConsecutivePtr(Ptr));
  }
  /// Returns true if the vector representation of the instruction \p I
  /// requires a mask.
  bool isMaskRequired(const Instruction *I) {
    return (MaskedOp.count(I) != 0);
  }
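
  // For illustration, the identity element returned by getReductionIdentity:
  //   RK_IntegerAdd -> 0    RK_IntegerMult -> 1    RK_IntegerOr  -> 0
  //   RK_IntegerAnd -> ~0   RK_IntegerXor  -> 0
  //   RK_FloatAdd   -> 0.0  RK_FloatMult   -> 1.0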
private:
  /// Check if a single basic block loop is vectorizable.
  /// At this point we know that this is a loop with a constant trip count
  /// and we only need to check individual instructions.
  bool canVectorizeInstrs();

  /// When we vectorize loops we may change the order in which
  /// we read and write from memory. This method checks if it is
  /// legal to vectorize the code, considering only memory constraints.
  /// Returns true if the loop is vectorizable.
  bool canVectorizeMemory();

  /// Return true if we can vectorize this loop using the IF-conversion
  /// transformation.
  bool canVectorizeWithIfConvert();

  /// Collect the variables that need to stay uniform after vectorization.
  void collectLoopUniforms();

  /// Return true if all of the instructions in the block can be speculatively
  /// executed. \p SafePtrs is a list of addresses that are known to be legal
  /// and we know that we can read from them without segfault.
  bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs);

  /// Returns True, if 'Phi' is the kind of reduction variable for type
  /// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
  bool AddReductionVar(PHINode *Phi, ReductionKind Kind);

  /// Returns a struct describing if the instruction 'I' can be a reduction
  /// variable of type 'Kind'. If the reduction is a min/max pattern of
  /// select(icmp()) this function advances the instruction pointer 'I' from
  /// the compare instruction to the select instruction and stores this pointer
  /// in the 'PatternLastInst' member of the returned struct.
  ReductionInstDesc isReductionInstr(Instruction *I, ReductionKind Kind,
                                     ReductionInstDesc &Desc);

  /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction
  /// pattern corresponding to a min(X, Y) or max(X, Y).
  static ReductionInstDesc isMinMaxSelectCmpPattern(Instruction *I,
                                                    ReductionInstDesc &Prev);

  /// Returns the induction kind of Phi. This function may return NoInduction
  /// if the PHI is not an induction variable.
  InductionKind isInductionVariable(PHINode *Phi);

  /// \brief Collect memory accesses with loop invariant strides.
  ///
  /// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop
  /// invariant.
  void collectStridedAccess(Value *LoadOrStoreInst);

  /// Report an analysis message to assist the user in diagnosing loops that
  /// are not vectorized.
  void emitAnalysis(Report &Message) {
    DebugLoc DL = TheLoop->getStartLoc();
    if (Instruction *I = Message.getInstr())
      DL = I->getDebugLoc();
    emitOptimizationRemarkAnalysis(TheFunction->getContext(), DEBUG_TYPE,
                                   *TheFunction, DL, Message.str());
  }
  /// The loop that we evaluate.
  Loop *TheLoop;
  /// Scev analysis.
  ScalarEvolution *SE;
  /// DataLayout analysis.
  const DataLayout *DL;
  /// Dominators.
  DominatorTree *DT;
  /// Target Library Info.
  TargetLibraryInfo *TLI;
  /// Alias analysis.
  AliasAnalysis *AA;
  /// Parent function
  Function *TheFunction;
  /// Target Transform Info
  const TargetTransformInfo *TTI;

  // --- vectorization state --- //

  /// Holds the integer induction variable. This is the counter of the
  /// loop.
  PHINode *Induction;
  /// Holds the reduction variables.
  ReductionList Reductions;
  /// Holds all of the induction variables that we found in the loop.
  /// Notice that inductions don't need to start at zero and that induction
  /// variables can be pointers.
  InductionList Inductions;
  /// Holds the widest induction type encountered.
  Type *WidestIndTy;

  /// Allowed outside users. This holds the reduction
  /// vars which can be accessed from outside the loop.
  SmallPtrSet<Value *, 4> AllowedExit;
  /// This set holds the variables which are known to be uniform after
  /// vectorization.
  SmallPtrSet<Instruction *, 4> Uniforms;
  /// We need to check that all of the pointers in this list are disjoint
  /// at runtime.
  RuntimePointerCheck PtrRtCheck;
  /// Can we assume the absence of NaNs.
  bool HasFunNoNaNAttr;

  unsigned MaxSafeDepDistBytes;

  ValueToValueMap Strides;
  SmallPtrSet<Value *, 8> StrideSet;

  /// While vectorizing these instructions we have to generate a
  /// call to the appropriate masked intrinsic
  SmallPtrSet<const Instruction *, 8> MaskedOp;
};
/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const DataLayout *DL, const TargetLibraryInfo *TLI,
                             AssumptionCache *AC, const Function *F,
                             const LoopVectorizeHints *Hints)
      : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI),
        TheFunction(F), Hints(Hints) {
    CodeMetrics::collectEphemeralValues(L, AC, EphValues);
  }
  /// Information about vectorization costs
  struct VectorizationFactor {
    unsigned Width; // Vector width with best cost
    unsigned Cost;  // Cost of the loop with that width
  };

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to VF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(bool OptForSize);

  /// \return The size (in bits) of the widest type in the code that
  /// needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  unsigned getWidestType();

  /// \return The most profitable unroll factor.
  /// If UserUF is non-zero then this method finds the best unroll-factor
  /// based on register pressure and other parameters.
  /// VF and LoopCost are the selected vectorization factor and the cost of the
  /// selected VF.
  unsigned selectUnrollFactor(bool OptForSize, unsigned VF, unsigned LoopCost);

  /// \brief A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    unsigned LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    unsigned MaxLocalUsers;
    /// Holds the number of instructions in the loop.
    unsigned NumInstructions;
  };

  /// \return information about the register usage of the loop.
  RegisterUsage calculateRegisterUsage();
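
  // A sketch of how selectVectorizationFactor proceeds (illustrative, not the
  // exact code): every power-of-two width up to the maximum is costed, and
  // costs are compared per scalar lane, so a wider VF wins only if it lowers
  // the amortized cost:
  //   for (unsigned VF = 2; VF <= MaxVF; VF *= 2)
  //     if ((float)expectedCost(VF) / VF < BestCostPerLane) /* pick VF */;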
private:
  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  unsigned expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  unsigned getInstructionCost(Instruction *I, unsigned VF);

  /// A helper function for converting Scalar types to vector types.
  /// If the incoming type is void, we return void. If the VF is 1, we return
  /// the scalar type.
  static Type *ToVectorTy(Type *Scalar, unsigned VF);

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Report an analysis message to assist the user in diagnosing loops that
  /// are not vectorized.
  void emitAnalysis(Report &Message) {
    DebugLoc DL = TheLoop->getStartLoc();
    if (Instruction *I = Message.getInstr())
      DL = I->getDebugLoc();
    emitOptimizationRemarkAnalysis(TheFunction->getContext(), DEBUG_TYPE,
                                   *TheFunction, DL, Message.str());
  }

  /// Values used only by @llvm.assume calls.
  SmallPtrSet<const Value *, 32> EphValues;
  /// The loop that we evaluate.
  Loop *TheLoop;
  /// Scev analysis.
  ScalarEvolution *SE;
  /// Loop Info analysis.
  LoopInfo *LI;
  /// Vectorization legality.
  LoopVectorizationLegality *Legal;
  /// Vector target information.
  const TargetTransformInfo &TTI;
  /// Target data layout information.
  const DataLayout *DL;
  /// Target Library Info.
  const TargetLibraryInfo *TLI;
  const Function *TheFunction;
  // Loop Vectorize Hint.
  const LoopVectorizeHints *Hints;
};
/// Utility class for getting and setting loop vectorizer hints in the form
/// of loop metadata.
/// This class keeps a number of loop annotations locally (as member variables)
/// and can, upon request, write them back as metadata on the loop. It will
/// initially scan the loop for existing metadata, and will update the local
/// values based on information in the loop.
/// We cannot write all values to metadata, as the mere presence of some info,
/// for example 'force', means a decision has been made. So, we need to be
/// careful NOT to add them if the user hasn't specifically asked for it.
class LoopVectorizeHints {
  enum HintKind {
    HK_WIDTH,
    HK_UNROLL,
    HK_FORCE
  };

  /// Hint - associates name and validation with the hint value.
  struct Hint {
    const char *Name;
    unsigned Value; // This may have to change for non-numeric values.
    HintKind Kind;

    Hint(const char *Name, unsigned Value, HintKind Kind)
        : Name(Name), Value(Value), Kind(Kind) {}

    bool validate(unsigned Val) {
      switch (Kind) {
      case HK_WIDTH:
        return isPowerOf2_32(Val) && Val <= MaxVectorWidth;
      case HK_UNROLL:
        return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
      case HK_FORCE:
        return (Val <= 1);
      }
      return false;
    }
  };

  /// Vectorization width.
  Hint Width;
  /// Vectorization interleave factor.
  Hint Interleave;
  /// Vectorization forced
  Hint Force;

  /// Return the loop metadata prefix.
  static StringRef Prefix() { return "llvm.loop."; }
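
  // For illustration, hints arrive as loop metadata of the form:
  //   br i1 %c, label %loop, label %exit, !llvm.loop !0
  //   !0 = !{!0, !1, !2}
  //   !1 = !{!"llvm.loop.vectorize.width", i32 4}
  //   !2 = !{!"llvm.loop.interleave.count", i32 2}
  // which getHintsFromMetadata() below decodes into Width.Value == 4 and
  // Interleave.Value == 2.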
public:
  enum ForceKind {
    FK_Undefined = -1, ///< Not selected.
    FK_Disabled = 0,   ///< Forcing disabled.
    FK_Enabled = 1,    ///< Forcing enabled.
  };

  LoopVectorizeHints(const Loop *L, bool DisableInterleaving)
      : Width("vectorize.width", VectorizationFactor, HK_WIDTH),
        Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
        Force("vectorize.enable", FK_Undefined, HK_FORCE),
        TheLoop(L) {
    // Populate values with existing loop metadata.
    getHintsFromMetadata();

    // force-vector-interleave overrides DisableInterleaving.
    if (VectorizationInterleave.getNumOccurrences() > 0)
      Interleave.Value = VectorizationInterleave;

    DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
          << "LV: Interleaving disabled by the pass manager\n");
  }

  /// Mark the loop L as already vectorized by setting the width to 1.
  void setAlreadyVectorized() {
    Width.Value = Interleave.Value = 1;
    Hint Hints[] = {Width, Interleave};
    writeHintsToMetadata(Hints);
  }
  /// Dumps all the hint information.
  std::string emitRemark() const {
    Report R;
    if (Force.Value == LoopVectorizeHints::FK_Disabled)
      R << "vectorization is explicitly disabled";
    else {
      R << "use -Rpass-analysis=loop-vectorize for more info";
      if (Force.Value == LoopVectorizeHints::FK_Enabled) {
        R << " (Force=true";
        if (Width.Value != 0)
          R << ", Vector Width=" << Width.Value;
        if (Interleave.Value != 0)
          R << ", Interleave Count=" << Interleave.Value;
        R << ")";
      }
    }

    return R.str();
  }

  unsigned getWidth() const { return Width.Value; }
  unsigned getInterleave() const { return Interleave.Value; }
  enum ForceKind getForce() const { return (ForceKind)Force.Value; }
private:
  /// Find hints specified in the loop metadata and update local values.
  void getHintsFromMetadata() {
    MDNode *LoopID = TheLoop->getLoopID();
    if (!LoopID)
      return;

    // First operand should refer to the loop id itself.
    assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
    assert(LoopID->getOperand(0) == LoopID && "invalid loop id");

    for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
      const MDString *S = nullptr;
      SmallVector<Metadata *, 4> Args;

      // The expected hint is either a MDString or a MDNode with the first
      // operand a MDString.
      if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
        if (!MD || MD->getNumOperands() == 0)
          continue;
        S = dyn_cast<MDString>(MD->getOperand(0));
        for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
          Args.push_back(MD->getOperand(i));
      } else {
        S = dyn_cast<MDString>(LoopID->getOperand(i));
        assert(Args.size() == 0 && "too many arguments for MDString");
      }

      if (!S)
        continue;

      // Check if the hint starts with the loop metadata prefix.
      StringRef Name = S->getString();
      if (Args.size() == 1)
        setHint(Name, Args[0]);
    }
  }

  /// Checks a string hint with one operand and sets the value if valid.
  void setHint(StringRef Name, Metadata *Arg) {
    if (!Name.startswith(Prefix()))
      return;
    Name = Name.substr(Prefix().size(), StringRef::npos);

    const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
    if (!C) return;
    unsigned Val = C->getZExtValue();

    Hint *Hints[] = {&Width, &Interleave, &Force};
    for (auto H : Hints) {
      if (Name == H->Name) {
        if (H->validate(Val))
          H->Value = Val;
        else
          DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
        break;
      }
    }
  }
  /// Create a new hint from name / value pair.
  MDNode *createHintMetadata(StringRef Name, unsigned V) const {
    LLVMContext &Context = TheLoop->getHeader()->getContext();
    Metadata *MDs[] = {MDString::get(Context, Name),
                       ConstantAsMetadata::get(
                           ConstantInt::get(Type::getInt32Ty(Context), V))};
    return MDNode::get(Context, MDs);
  }

  /// Matches metadata with hint name.
  bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) {
    MDString *Name = dyn_cast<MDString>(Node->getOperand(0));
    if (!Name)
      return false;

    for (auto H : HintTypes)
      if (Name->getString().endswith(H.Name))
        return true;
    return false;
  }

  /// Sets current hints into loop metadata, keeping other values intact.
  void writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
    if (HintTypes.size() == 0)
      return;

    // Reserve the first element to LoopID (see below).
    SmallVector<Metadata *, 4> MDs(1);
    // If the loop already has metadata, then ignore the existing operands.
    MDNode *LoopID = TheLoop->getLoopID();
    if (LoopID) {
      for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
        MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
        // If node in update list, ignore old value.
        if (!matchesHintMetadataName(Node, HintTypes))
          MDs.push_back(Node);
      }
    }

    // Now, add the missing hints.
    for (auto H : HintTypes)
      MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));

    // Replace current metadata node with new one.
    LLVMContext &Context = TheLoop->getHeader()->getContext();
    MDNode *NewLoopID = MDNode::get(Context, MDs);
    // Set operand 0 to refer to the loop id itself.
    NewLoopID->replaceOperandWith(0, NewLoopID);
    TheLoop->setLoopID(NewLoopID);
  }

  /// The loop these hints belong to.
  const Loop *TheLoop;
};
static void emitMissedWarning(Function *F, Loop *L,
                              const LoopVectorizeHints &LH) {
  emitOptimizationRemarkMissed(F->getContext(), DEBUG_TYPE, *F,
                               L->getStartLoc(), LH.emitRemark());

  if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {
    if (LH.getWidth() != 1)
      emitLoopVectorizeWarning(
          F->getContext(), *F, L->getStartLoc(),
          "failed explicitly specified loop vectorization");
    else if (LH.getInterleave() != 1)
      emitLoopInterleaveWarning(
          F->getContext(), *F, L->getStartLoc(),
          "failed explicitly specified loop interleaving");
  }
}

static void addInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
  if (L.empty())
    return V.push_back(&L);
  for (Loop *InnerL : L)
    addInnerLoop(*InnerL, V);
}
/// The LoopVectorize Pass.
struct LoopVectorize : public FunctionPass {
  /// Pass identification, replacement for typeid
  static char ID;

  explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)
      : FunctionPass(ID),
        DisableUnrolling(NoUnrolling),
        AlwaysVectorize(AlwaysVectorize) {
    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
  }

  ScalarEvolution *SE;
  const DataLayout *DL;
  LoopInfo *LI;
  TargetTransformInfo *TTI;
  DominatorTree *DT;
  BlockFrequencyInfo *BFI;
  TargetLibraryInfo *TLI;
  AliasAnalysis *AA;
  AssumptionCache *AC;
  bool DisableUnrolling;
  bool AlwaysVectorize;

  BlockFrequency ColdEntryFreq;
2014-11-24 04:08:18 -05:00
bool runOnFunction ( Function & F ) override {
2012-12-02 08:10:19 -05:00
SE = & getAnalysis < ScalarEvolution > ( ) ;
2014-11-24 04:08:18 -05:00
DataLayoutPass * DLP = getAnalysisIfAvailable < DataLayoutPass > ( ) ;
DL = DLP ? & DLP - > getDataLayout ( ) : nullptr ;
2012-12-02 08:10:19 -05:00
LI = & getAnalysis < LoopInfo > ( ) ;
2013-04-08 14:41:23 -04:00
TTI = & getAnalysis < TargetTransformInfo > ( ) ;
2014-11-24 04:08:18 -05:00
DT = & getAnalysis < DominatorTreeWrapperPass > ( ) . getDomTree ( ) ;
BFI = & getAnalysis < BlockFrequencyInfo > ( ) ;
2013-04-08 14:41:23 -04:00
TLI = getAnalysisIfAvailable < TargetLibraryInfo > ( ) ;
2014-11-24 04:08:18 -05:00
AA = & getAnalysis < AliasAnalysis > ( ) ;
2015-01-18 11:17:27 -05:00
AC = & getAnalysis < AssumptionCacheTracker > ( ) . getAssumptionCache ( F ) ;
2014-11-24 04:08:18 -05:00
// Compute some weights outside of the loop over the loops. Compute this
// using a BranchProbability to re-use its scaling math.
const BranchProbability ColdProb ( 1 , 5 ) ; // 20%
ColdEntryFreq = BlockFrequency ( BFI - > getEntryFreq ( ) ) * ColdProb ;
2012-12-02 08:10:19 -05:00
2013-12-21 19:04:03 -05:00
// If the target claims to have no vector registers don't attempt
// vectorization.
if ( ! TTI - > getNumberOfRegisters ( true ) )
return false ;
2014-11-24 04:08:18 -05:00
if ( ! DL ) {
DEBUG ( dbgs ( ) < < " \n LV: Not vectorizing " < < F . getName ( )
< < " : Missing data layout \n " ) ;
2013-06-10 16:36:52 -04:00
return false ;
}
2014-11-24 04:08:18 -05:00
// Build up a worklist of inner-loops to vectorize. This is necessary as
// the act of vectorizing or partially unrolling a loop creates new loops
// and can invalidate iterators across the loops.
SmallVector < Loop * , 8 > Worklist ;
for ( Loop * L : * LI )
addInnerLoop ( * L , Worklist ) ;
LoopsAnalyzed + = Worklist . size ( ) ;
// Now walk the identified inner loops.
bool Changed = false ;
while ( ! Worklist . empty ( ) )
Changed | = processLoop ( Worklist . pop_back_val ( ) ) ;
// Process each loop nest in the function.
return Changed ;
}
  bool processLoop(Loop *L) {
    assert(L->empty() && "Only process inner loops.");

#ifndef NDEBUG
    const std::string DebugLocStr = getDebugLocString(L);
#endif /* NDEBUG */

    DEBUG(dbgs() << "\nLV: Checking a loop in \""
                 << L->getHeader()->getParent()->getName() << "\" from "
                 << DebugLocStr << "\n");

    LoopVectorizeHints Hints(L, DisableUnrolling);

    DEBUG(dbgs() << "LV: Loop hints:"
                 << " force="
                 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
                         ? "disabled"
                         : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
                                ? "enabled"
                                : "?")) << " width=" << Hints.getWidth()
                 << " unroll=" << Hints.getInterleave() << "\n");

    // Function containing loop
    Function *F = L->getHeader()->getParent();

    // Looking at the diagnostic output is the only way to determine if a loop
    // was vectorized (other than looking at the IR or machine code), so it
    // is important to generate an optimization remark for each loop. Most of
    // these messages are generated by emitOptimizationRemarkAnalysis. Remarks
    // generated by emitOptimizationRemark and emitOptimizationRemarkMissed are
    // less verbose reporting vectorized loops and unvectorized loops that may
    // benefit from vectorization, respectively.

    if (Hints.getForce() == LoopVectorizeHints::FK_Disabled) {
      DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
      emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F,
                                     L->getStartLoc(), Hints.emitRemark());
      return false;
    }

    if (!AlwaysVectorize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) {
      DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
      emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F,
                                     L->getStartLoc(), Hints.emitRemark());
      return false;
    }

    if (Hints.getWidth() == 1 && Hints.getInterleave() == 1) {
      DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
      emitOptimizationRemarkAnalysis(
          F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
          "loop not vectorized: vector width and interleave count are "
          "explicitly set to 1");
      return false;
    }
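
    // For reference, these hints typically originate from source-level
    // pragmas, e.g. (illustrative):
    //
    //   #pragma clang loop vectorize_width(4) interleave_count(2)
    //   for (int i = 0; i < n; ++i) ...
    //
    // which clang lowers into "llvm.loop.vectorize.width" and
    // "llvm.loop.interleave.count" metadata read by LoopVectorizeHints.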
    // Check the loop for a trip count threshold:
    // do not vectorize loops with a tiny trip count.
    const unsigned TC = SE->getSmallConstantTripCount(L);
    if (TC > 0u && TC < TinyTripCountVectorThreshold) {
      DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                   << "This loop is not worth vectorizing.");
      if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
        DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
      else {
        DEBUG(dbgs() << "\n");
        emitOptimizationRemarkAnalysis(
            F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
            "vectorization is not beneficial and is not explicitly forced");
        return false;
      }
    }

    // Check if it is legal to vectorize the loop.
    LoopVectorizationLegality LVL(L, SE, DL, DT, TLI, AA, F, TTI);
    if (!LVL.canVectorize()) {
      DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
      emitMissedWarning(F, L, Hints);
      return false;
    }

    // Use the cost model.
    LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI, AC, F,
                                  &Hints);

    // Check the function attributes to find out if this function should be
    // optimized for size.
    bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
                      F->hasFnAttribute(Attribute::OptimizeForSize);

    // Compute the weighted frequency of this loop being executed and see if it
    // is less than 20% of the function entry baseline frequency. Note that we
    // always have a canonical loop here because we think we *can* vectorize.
    // FIXME: This is hidden behind a flag due to pervasive problems with
    // exactly what block frequency models.
    if (LoopVectorizeWithBlockFrequency) {
      BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader());
      if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
          LoopEntryFreq < ColdEntryFreq)
        OptForSize = true;
    }

    // Check the function attributes to see if implicit floats are allowed.
    // FIXME: This check doesn't seem possibly correct -- what if the loop is
    // an integer loop and the vector instructions selected are purely integer
    // vector instructions?
    if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
      DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
                      " attribute is used.\n");
      emitOptimizationRemarkAnalysis(
          F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
          "loop not vectorized due to NoImplicitFloat attribute");
      emitMissedWarning(F, L, Hints);
      return false;
    }

    // Select the optimal vectorization factor.
    const LoopVectorizationCostModel::VectorizationFactor VF =
        CM.selectVectorizationFactor(OptForSize);

    // Select the unroll factor.
    const unsigned UF =
        CM.selectUnrollFactor(OptForSize, VF.Width, VF.Cost);

    DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
                 << DebugLocStr << '\n');
    DEBUG(dbgs() << "LV: Unroll Factor is " << UF << '\n');

    if (VF.Width == 1) {
      DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial\n");

      if (UF == 1) {
        emitOptimizationRemarkAnalysis(
            F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
            "not beneficial to vectorize and user disabled interleaving");
        return false;
      }
      DEBUG(dbgs() << "LV: Trying to at least unroll the loops.\n");

      // Report the unrolling decision.
      emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
                             Twine("unrolled with interleaving factor " +
                                   Twine(UF) +
                                   " (vectorization not beneficial)"));

      // We decided not to vectorize, but we may want to unroll.
      InnerLoopUnroller Unroller(L, SE, LI, DT, DL, TLI, UF);
      Unroller.vectorize(&LVL);
    } else {
      // If we decided that it is *legal* to vectorize the loop then do it.
      InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF);
      LB.vectorize(&LVL);
      ++LoopsVectorized;

      // Report the vectorization decision.
      emitOptimizationRemark(
          F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
          Twine("vectorized loop (vectorization factor: ") + Twine(VF.Width) +
              ", unrolling interleave factor: " + Twine(UF) + ")");
    }

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();

    DEBUG(verifyFunction(*L->getHeader()->getParent()));
    return true;
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequiredID(LoopSimplifyID);
    AU.addRequiredID(LCSSAID);
    AU.addRequired<BlockFrequencyInfo>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<LoopInfo>();
    AU.addRequired<ScalarEvolution>();
    AU.addRequired<TargetTransformInfo>();
    AU.addRequired<AliasAnalysis>();
    AU.addPreserved<LoopInfo>();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addPreserved<AliasAnalysis>();
  }
};

} // end anonymous namespace

//===----------------------------------------------------------------------===//
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
// LoopVectorizationCostModel.
//===----------------------------------------------------------------------===//
static Value *stripIntegerCast(Value *V) {
  if (CastInst *CI = dyn_cast<CastInst>(V))
    if (CI->getOperand(0)->getType()->isIntegerTy())
      return CI->getOperand(0);
  return V;
}
///\brief Replaces the symbolic stride in a pointer SCEV expression by one.
///
/// If \p OrigPtr is not null, use it to look up the stride value instead of
/// \p Ptr.
static const SCEV *replaceSymbolicStrideSCEV(ScalarEvolution *SE,
                                             ValueToValueMap &PtrToStride,
                                             Value *Ptr,
                                             Value *OrigPtr = nullptr) {
  const SCEV *OrigSCEV = SE->getSCEV(Ptr);

  // If there is an entry in the map return the SCEV of the pointer with the
  // symbolic stride replaced by one.
  ValueToValueMap::iterator SI = PtrToStride.find(OrigPtr ? OrigPtr : Ptr);
  if (SI != PtrToStride.end()) {
    Value *StrideVal = SI->second;

    // Strip casts.
    StrideVal = stripIntegerCast(StrideVal);

    // Replace symbolic stride by one.
    Value *One = ConstantInt::get(StrideVal->getType(), 1);
    ValueToValueMap RewriteMap;
    RewriteMap[StrideVal] = One;

    const SCEV *ByOne =
        SCEVParameterRewriter::rewrite(OrigSCEV, *SE, RewriteMap, true);
    DEBUG(dbgs() << "LV: Replacing SCEV: " << *OrigSCEV << " by: " << *ByOne
                 << "\n");
    return ByOne;
  }

  // Otherwise, just return the SCEV of the original pointer.
  return SE->getSCEV(Ptr);
}
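
// Illustrative sketch (names assumed): for a pointer whose SCEV is
//
//   {%B,+,(4 * %Stride)}<%loop>
//
// and a map entry binding the access to %Stride, the rewritten SCEV becomes
// {%B,+,4}<%loop>, i.e. a unit-stride access once a runtime check has
// established that %Stride == 1.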
void LoopVectorizationLegality::RuntimePointerCheck::insert(
    ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr, unsigned DepSetId,
    unsigned ASId, ValueToValueMap &Strides) {
  // Get the stride replaced scev.
  const SCEV *Sc = replaceSymbolicStrideSCEV(SE, Strides, Ptr);
  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
  assert(AR && "Invalid addrec expression");
  const SCEV *Ex = SE->getBackedgeTakenCount(Lp);
  const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
  Pointers.push_back(Ptr);
  Starts.push_back(AR->getStart());
  Ends.push_back(ScEnd);
  IsWritePtr.push_back(WritePtr);
  DependencySetId.push_back(DepSetId);
  AliasSetId.push_back(ASId);
}
Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop.
  Instruction *Instr = dyn_cast<Instruction>(V);
  bool NewInstr =
      (Instr && std::find(LoopVectorBody.begin(), LoopVectorBody.end(),
                          Instr->getParent()) != LoopVectorBody.end());
  bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;

  // Place the code for broadcasting invariant variables in the new preheader.
  IRBuilder<>::InsertPointGuard Guard(Builder);
  if (Invariant)
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

  // Broadcast the scalar into all locations in the vector.
  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");

  return Shuf;
}
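
// For reference, CreateVectorSplat expands to an insertelement into lane 0
// followed by a zero-mask shufflevector; for VF == 4 roughly (a sketch, value
// names assumed):
//
//   %splatinsert = insertelement <4 x i32> undef, i32 %v, i32 0
//   %broadcast   = shufflevector <4 x i32> %splatinsert, <4 x i32> undef,
//                                <4 x i32> zeroinitializer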
Value *InnerLoopVectorizer::getConsecutiveVector(Value *Val, int StartIdx,
                                                 bool Negate) {
  assert(Val->getType()->isVectorTy() && "Must be a vector");
  assert(Val->getType()->getScalarType()->isIntegerTy() &&
         "Elem must be an integer");
  // Create the types.
  Type *ITy = Val->getType()->getScalarType();
  VectorType *Ty = cast<VectorType>(Val->getType());
  int VLen = Ty->getNumElements();
  SmallVector<Constant *, 8> Indices;

  // Create a vector of consecutive numbers from zero to VF.
  for (int i = 0; i < VLen; ++i) {
    int64_t Idx = Negate ? (-i) : i;
    Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx, Negate));
  }

  // Add the consecutive indices to the vector value.
  Constant *Cv = ConstantVector::get(Indices);
  assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
  return Builder.CreateAdd(Val, Cv, "induction");
}
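
// Illustrative sketch: with VF == 4 and StartIdx == 0, applied to a splat of
// the scalar induction value, this produces
//
//   %induction = add <4 x i64> %broadcast, <i64 0, i64 1, i64 2, i64 3>
//
// and with Negate set the constant becomes <0, -1, -2, -3>, matching a
// reversed access pattern.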
/// \brief Find the operand of the GEP that should be checked for consecutive
/// stores. This ignores trailing indices that have no effect on the final
/// pointer.
static unsigned getGEPInductionOperand(const DataLayout *DL,
                                       const GetElementPtrInst *Gep) {
  unsigned LastOperand = Gep->getNumOperands() - 1;
  unsigned GEPAllocSize = DL->getTypeAllocSize(
      cast<PointerType>(Gep->getType()->getScalarType())->getElementType());

  // Walk backwards and try to peel off zeros.
  while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) {
    // Find the type we're currently indexing into.
    gep_type_iterator GEPTI = gep_type_begin(Gep);
    std::advance(GEPTI, LastOperand - 1);

    // If it's a type with the same allocation size as the result of the GEP we
    // can peel off the zero index.
    if (DL->getTypeAllocSize(*GEPTI) != GEPAllocSize)
      break;
    --LastOperand;
  }

  return LastOperand;
}
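
// Illustrative sketch: for
//
//   %p = getelementptr inbounds [1 x i32]* %A, i64 %iv, i64 0
//
// the trailing zero index steps through a type with the same allocation size
// as the i32 result, so it is peeled off and operand 1 (%iv) is reported as
// the induction operand.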
int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
  assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr");

  // Make sure that the pointer does not point to structs.
  if (Ptr->getType()->getPointerElementType()->isAggregateType())
    return 0;

  // If this value is a pointer induction variable we know it is consecutive.
  PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
  if (Phi && Inductions.count(Phi)) {
    InductionInfo II = Inductions[Phi];
    if (IK_PtrInduction == II.IK)
      return 1;
    else if (IK_ReversePtrInduction == II.IK)
      return -1;
  }

  GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
  if (!Gep)
    return 0;

  unsigned NumOperands = Gep->getNumOperands();
  Value *GpPtr = Gep->getPointerOperand();
  // If this GEP value is a consecutive pointer induction variable and all of
  // the indices are loop invariant then we know it is consecutive.
  Phi = dyn_cast<PHINode>(GpPtr);
  if (Phi && Inductions.count(Phi)) {

    // Make sure that the pointer does not point to structs.
    PointerType *GepPtrType = cast<PointerType>(GpPtr->getType());
    if (GepPtrType->getElementType()->isAggregateType())
      return 0;

    // Make sure that all of the index operands are loop invariant.
    for (unsigned i = 1; i < NumOperands; ++i)
      if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
        return 0;

    InductionInfo II = Inductions[Phi];
    if (IK_PtrInduction == II.IK)
      return 1;
    else if (IK_ReversePtrInduction == II.IK)
      return -1;
  }

  unsigned InductionOperand = getGEPInductionOperand(DL, Gep);

  // Check that all of the gep indices are uniform except for our induction
  // operand.
  for (unsigned i = 0; i != NumOperands; ++i)
    if (i != InductionOperand &&
        !SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
      return 0;

  // We can emit wide load/stores only if the last non-zero index is the
  // induction variable.
  const SCEV *Last = nullptr;
  if (!Strides.count(Gep))
    Last = SE->getSCEV(Gep->getOperand(InductionOperand));
  else {
    // Because of the multiplication by a stride we can have a s/zext cast.
    // We are going to replace this stride by 1 so the cast is safe to ignore.
    //
    //  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
    //  %0 = trunc i64 %indvars.iv to i32
    //  %mul = mul i32 %0, %Stride1
    //  %idxprom = zext i32 %mul to i64  << Safe cast.
    //  %arrayidx = getelementptr inbounds i32* %B, i64 %idxprom
    //
    Last = replaceSymbolicStrideSCEV(SE, Strides,
                                     Gep->getOperand(InductionOperand), Gep);
    if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(Last))
      Last =
          (C->getSCEVType() == scSignExtend || C->getSCEVType() == scZeroExtend)
              ? C->getOperand()
              : Last;
  }

  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
    const SCEV *Step = AR->getStepRecurrence(*SE);

    // The memory is consecutive because the last index is consecutive
    // and all other indices are loop invariant.
    if (Step->isOne())
      return 1;
    if (Step->isAllOnesValue())
      return -1;
  }

  return 0;
}
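
// Illustrative examples of the returned stride (a sketch, assuming a
// canonical induction variable i):
//
//   A[i]     -> 1   (unit stride forward)
//   A[n - i] -> -1  (unit stride backward)
//   A[2 * i] -> 0   (non-unit stride, not consecutive)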
bool LoopVectorizationLegality::isUniform(Value *V) {
  return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
}

InnerLoopVectorizer::VectorParts&
InnerLoopVectorizer::getVectorValue(Value *V) {
  assert(V != Induction && "The new induction variable should not be used.");
  assert(!V->getType()->isVectorTy() && "Can't widen a vector");

  // If we have a stride that is replaced by one, do it here.
  if (Legal->hasStride(V))
    V = ConstantInt::get(V->getType(), 1);

  // If we have this scalar in the map, return it.
  if (WidenMap.has(V))
    return WidenMap.get(V);

  // If this scalar is unknown, assume that it is a constant or that it is
  // loop invariant. Broadcast V and save the value for future uses.
  Value *B = getBroadcastInstrs(V);
  return WidenMap.splat(V, B);
}
Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
  assert(Vec->getType()->isVectorTy() && "Invalid type");
  SmallVector<Constant *, 8> ShuffleMask;
  for (unsigned i = 0; i < VF; ++i)
    ShuffleMask.push_back(Builder.getInt32(VF - i - 1));

  return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
                                     ConstantVector::get(ShuffleMask),
                                     "reverse");
}
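
// Illustrative sketch: for VF == 4 this emits
//
//   %reverse = shufflevector <4 x i32> %vec, <4 x i32> undef,
//                            <4 x i32> <i32 3, i32 2, i32 1, i32 0>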
void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
  // Attempt to issue a wide load.
  LoadInst *LI = dyn_cast<LoadInst>(Instr);
  StoreInst *SI = dyn_cast<StoreInst>(Instr);

  assert((LI || SI) && "Invalid Load/Store instruction");

  Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType();
  Type *DataTy = VectorType::get(ScalarDataTy, VF);
  Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
  unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
  // An alignment of 0 means target abi alignment. We need to use the scalar's
  // target abi alignment in such a case.
  if (!Alignment)
    Alignment = DL->getABITypeAlignment(ScalarDataTy);
  unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
  unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy);
  unsigned VectorElementSize = DL->getTypeStoreSize(DataTy) / VF;

  if (SI && Legal->blockNeedsPredication(SI->getParent()) &&
      !Legal->isMaskRequired(SI))
    return scalarizeInstruction(Instr, true);

  if (ScalarAllocatedSize != VectorElementSize)
    return scalarizeInstruction(Instr);

  // If the pointer is loop invariant or if it is non-consecutive,
  // scalarize the load.
  int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
  bool Reverse = ConsecutiveStride < 0;
  bool UniformLoad = LI && Legal->isUniform(Ptr);
  if (!ConsecutiveStride || UniformLoad)
    return scalarizeInstruction(Instr);

  Constant *Zero = Builder.getInt32(0);
  VectorParts &Entry = WidenMap.get(Instr);

  // Handle consecutive loads/stores.
  GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
  if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) {
    setDebugLocFromInst(Builder, Gep);
    Value *PtrOperand = Gep->getPointerOperand();
    Value *FirstBasePtr = getVectorValue(PtrOperand)[0];
    FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero);

    // Create the new GEP with the new induction variable.
    GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
    Gep2->setOperand(0, FirstBasePtr);
    Gep2->setName("gep.indvar.base");
    Ptr = Builder.Insert(Gep2);
  } else if (Gep) {
    setDebugLocFromInst(Builder, Gep);
    assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()),
                               OrigLoop) && "Base ptr must be invariant");

    // The last index does not have to be the induction. It can be
    // consecutive and be a function of the index. For example A[I+1];
    unsigned NumOperands = Gep->getNumOperands();
    unsigned InductionOperand = getGEPInductionOperand(DL, Gep);
    // Create the new GEP with the new induction variable.
    GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());

    for (unsigned i = 0; i < NumOperands; ++i) {
      Value *GepOperand = Gep->getOperand(i);
      Instruction *GepOperandInst = dyn_cast<Instruction>(GepOperand);

      // Update last index or loop invariant instruction anchored in loop.
      if (i == InductionOperand ||
          (GepOperandInst && OrigLoop->contains(GepOperandInst))) {
        assert((i == InductionOperand ||
                SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) &&
               "Must be last index or loop invariant");

        VectorParts &GEPParts = getVectorValue(GepOperand);
        Value *Index = GEPParts[0];
        Index = Builder.CreateExtractElement(Index, Zero);
        Gep2->setOperand(i, Index);
        Gep2->setName("gep.indvar.idx");
      }
    }
    Ptr = Builder.Insert(Gep2);
  } else {
    // Use the induction element ptr.
    assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
    setDebugLocFromInst(Builder, Ptr);
    VectorParts &PtrVal = getVectorValue(Ptr);
    Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
  }

  VectorParts Mask = createBlockInMask(Instr->getParent());

  // Handle Stores:
  if (SI) {
    assert(!Legal->isUniform(SI->getPointerOperand()) &&
           "We do not allow storing to uniform addresses");
    setDebugLocFromInst(Builder, SI);
    // We don't want to update the value in the map as it might be used in
    // another expression. So don't use a reference type for "StoredVal".
    VectorParts StoredVal = getVectorValue(SI->getValueOperand());

    for (unsigned Part = 0; Part < UF; ++Part) {
      // Calculate the pointer for the specific unroll-part.
      Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));

      if (Reverse) {
        // If we store to reverse consecutive memory locations then we need
        // to reverse the order of elements in the stored value.
        StoredVal[Part] = reverseVector(StoredVal[Part]);
        // If the address is consecutive but reversed, then the
        // wide store needs to start at the last vector element.
        PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
        PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
      }
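
      // Illustrative sketch of the reversed addressing: with VF == 4, part
      // Part covers elements [-(Part*4) - 3, -(Part*4)] relative to Ptr, so
      // the part pointer is moved back by Part*VF elements and then by a
      // further (VF - 1) elements before the wide store is issued.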
      Value *VecPtr = Builder.CreateBitCast(PartPtr,
                                            DataTy->getPointerTo(AddressSpace));
      Instruction *NewSI;
      if (Legal->isMaskRequired(SI))
        NewSI = Builder.CreateMaskedStore(StoredVal[Part], VecPtr, Alignment,
                                          Mask[Part]);
      else
        NewSI = Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment);
      propagateMetadata(NewSI, SI);
    }
    return;
  }

  // Handle loads.
  assert(LI && "Must have a load instruction");
  setDebugLocFromInst(Builder, LI);
  for (unsigned Part = 0; Part < UF; ++Part) {
    // Calculate the pointer for the specific unroll-part.
    Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));

    if (Reverse) {
      // If the address is consecutive but reversed, then the
      // wide load needs to start at the last vector element.
      PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
      PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
    }

    Instruction *NewLI;
    Value *VecPtr = Builder.CreateBitCast(PartPtr,
                                          DataTy->getPointerTo(AddressSpace));
    if (Legal->isMaskRequired(LI))
      NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
                                       UndefValue::get(DataTy),
                                       "wide.masked.load");
    else
      NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
    propagateMetadata(NewLI, LI);
    Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI;
  }
}
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
                                               bool IfPredicateStore) {
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
  // Holds vector parameters or scalars, in case of uniform vals.
  SmallVector<VectorParts, 4> Params;

  setDebugLocFromInst(Builder, Instr);

  // Find all of the vectorized parameters.
  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
    Value *SrcOp = Instr->getOperand(op);

    // If we are accessing the old induction variable, use the new one.
    if (SrcOp == OldInduction) {
      Params.push_back(getVectorValue(SrcOp));
      continue;
    }

    // Try using previously calculated values.
    Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);

    // If the src is an instruction that appeared earlier in the basic block
    // then it should already be vectorized.
    if (SrcInst && OrigLoop->contains(SrcInst)) {
      assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
      // The parameter is a vector value from earlier.
      Params.push_back(WidenMap.get(SrcInst));
    } else {
      // The parameter is a scalar from outside the loop. Maybe even a constant.
      VectorParts Scalars;
      Scalars.append(UF, SrcOp);
      Params.push_back(Scalars);
    }
  }

  assert(Params.size() == Instr->getNumOperands() &&
         "Invalid number of operands");

  // Does this instruction return a value?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Value *UndefVec = IsVoidRetTy ? nullptr :
    UndefValue::get(VectorType::get(Instr->getType(), VF));
  // Create a new entry in the WidenMap and initialize it to Undef or Null.
  VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);

  Instruction *InsertPt = Builder.GetInsertPoint();
  BasicBlock *IfBlock = Builder.GetInsertBlock();
  BasicBlock *CondBlock = nullptr;

  VectorParts Cond;
  Loop *VectorLp = nullptr;
  if (IfPredicateStore) {
    assert(Instr->getParent()->getSinglePredecessor() &&
           "Only support single predecessor blocks");
    Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
                          Instr->getParent());
    VectorLp = LI->getLoopFor(IfBlock);
    assert(VectorLp && "Must have a loop for this block");
  }

  // For each vector unroll 'part':
  for (unsigned Part = 0; Part < UF; ++Part) {
    // For each scalar that we create:
    for (unsigned Width = 0; Width < VF; ++Width) {

      // Start if-block.
      Value *Cmp = nullptr;
      if (IfPredicateStore) {
        Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width));
        Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp,
                                 ConstantInt::get(Cmp->getType(), 1));
        CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
        LoopVectorBody.push_back(CondBlock);
        VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase());
        // Update Builder with newly created basic block.
        Builder.SetInsertPoint(InsertPt);
      }

      Instruction *Cloned = Instr->clone();
      if (!IsVoidRetTy)
        Cloned->setName(Instr->getName() + ".cloned");
      // Replace the operands of the cloned instructions with extracted scalars.
      for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
        Value *Op = Params[op][Part];
        // Param is a vector. Need to extract the right lane.
        if (Op->getType()->isVectorTy())
          Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width));
        Cloned->setOperand(op, Op);
      }

      // Place the cloned scalar in the new loop.
      Builder.Insert(Cloned);

      // If the original scalar returns a value we need to place it in a vector
      // so that future users will be able to use it.
      if (!IsVoidRetTy)
        VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
                                                       Builder.getInt32(Width));
      // End if-block.
      if (IfPredicateStore) {
        BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
        LoopVectorBody.push_back(NewIfBlock);
        VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase());
        Builder.SetInsertPoint(InsertPt);
        Instruction *OldBr = IfBlock->getTerminator();
        BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
        OldBr->eraseFromParent();
        IfBlock = NewIfBlock;
      }
    }
  }
}
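
// Illustrative sketch of the control flow produced per lane when
// IfPredicateStore is set (VF * UF lanes in total; value names assumed):
//
//     %cmp = extractelement <VF x i1> %mask, i32 Width
//     br i1 %cmp, label %cond.store, label %else
//   cond.store:
//     ... cloned scalar store for this lane ...
//     br label %else
//   else:
//     ... next lane ...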
static Instruction *getFirstInst(Instruction *FirstInst, Value *V,
                                 Instruction *Loc) {
  if (FirstInst)
    return FirstInst;
  if (Instruction *I = dyn_cast<Instruction>(V))
    return I->getParent() == Loc->getParent() ? I : nullptr;
  return nullptr;
}

std::pair<Instruction *, Instruction *>
InnerLoopVectorizer::addStrideCheck(Instruction *Loc) {
  Instruction *tnullptr = nullptr;
  if (!Legal->mustCheckStrides())
    return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr);

  IRBuilder<> ChkBuilder(Loc);

  // Emit checks.
  Value *Check = nullptr;
  Instruction *FirstInst = nullptr;
  for (SmallPtrSet<Value *, 8>::iterator SI = Legal->strides_begin(),
                                         SE = Legal->strides_end();
       SI != SE; ++SI) {
    Value *Ptr = stripIntegerCast(*SI);
    Value *C = ChkBuilder.CreateICmpNE(Ptr, ConstantInt::get(Ptr->getType(), 1),
                                       "stride.chk");
    // Store the first instruction we create.
    FirstInst = getFirstInst(FirstInst, C, Loc);
    if (Check)
      Check = ChkBuilder.CreateOr(Check, C);
    else
      Check = C;
  }

  // We have to do this trickery because the IRBuilder might fold the check to
  // a constant expression in which case there is no Instruction anchored in
  // the block.
  LLVMContext &Ctx = Loc->getContext();
  Instruction *TheCheck =
      BinaryOperator::CreateAnd(Check, ConstantInt::getTrue(Ctx));
  ChkBuilder.Insert(TheCheck, "stride.not.one");
  FirstInst = getFirstInst(FirstInst, TheCheck, Loc);
  return std::make_pair(FirstInst, TheCheck);
}
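
// Illustrative sketch of the emitted guard for two symbolic strides (value
// names beyond those created above are assumed):
//
//   %stride.chk  = icmp ne i64 %Stride1, 1
//   %stride.chk1 = icmp ne i64 %Stride2, 1
//   %0 = or i1 %stride.chk, %stride.chk1
//   %stride.not.one = and i1 %0, true
//
// A true result sends execution down the scalar loop.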
std::pair<Instruction *, Instruction *>
InnerLoopVectorizer::addRuntimeCheck(Instruction *Loc) {
  LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
      Legal->getRuntimePointerCheck();

  Instruction *tnullptr = nullptr;
  if (!PtrRtCheck->Need)
    return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr);

  unsigned NumPointers = PtrRtCheck->Pointers.size();
  SmallVector<TrackingVH<Value>, 2> Starts;
  SmallVector<TrackingVH<Value>, 2> Ends;

  LLVMContext &Ctx = Loc->getContext();
  SCEVExpander Exp(*SE, "induction");
  Instruction *FirstInst = nullptr;

  for (unsigned i = 0; i < NumPointers; ++i) {
    Value *Ptr = PtrRtCheck->Pointers[i];
    const SCEV *Sc = SE->getSCEV(Ptr);

    if (SE->isLoopInvariant(Sc, OrigLoop)) {
      DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" <<
            *Ptr << "\n");
      Starts.push_back(Ptr);
      Ends.push_back(Ptr);
    } else {
      DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr << '\n');
      unsigned AS = Ptr->getType()->getPointerAddressSpace();

      // Use this type for pointer arithmetic.
      Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS);

      Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc);
      Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc);
      Starts.push_back(Start);
      Ends.push_back(End);
    }
  }

  IRBuilder<> ChkBuilder(Loc);
  // Our instructions might fold to a constant.
  Value *MemoryRuntimeCheck = nullptr;
  for (unsigned i = 0; i < NumPointers; ++i) {
    for (unsigned j = i + 1; j < NumPointers; ++j) {
      // No need to check if two readonly pointers intersect.
      if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j])
        continue;

      // Only need to check pointers between two different dependency sets.
      if (PtrRtCheck->DependencySetId[i] == PtrRtCheck->DependencySetId[j])
        continue;
      // Only need to check pointers in the same alias set.
      if (PtrRtCheck->AliasSetId[i] != PtrRtCheck->AliasSetId[j])
        continue;

      unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace();
      unsigned AS1 = Starts[j]->getType()->getPointerAddressSpace();

      assert((AS0 == Ends[j]->getType()->getPointerAddressSpace()) &&
             (AS1 == Ends[i]->getType()->getPointerAddressSpace()) &&
             "Trying to bounds check pointers with different address spaces");

      Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0);
      Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1);

      Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy0, "bc");
      Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy1, "bc");
      Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy1, "bc");
      Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy0, "bc");

      // Two ranges [Start0, End0) and [Start1, End1) overlap exactly when
      // Start0 <= End1 and Start1 <= End0.
      Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0");
      FirstInst = getFirstInst(FirstInst, Cmp0, Loc);
      Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1");
      FirstInst = getFirstInst(FirstInst, Cmp1, Loc);
      Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict");
      FirstInst = getFirstInst(FirstInst, IsConflict, Loc);
      if (MemoryRuntimeCheck) {
        IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict,
                                         "conflict.rdx");
        FirstInst = getFirstInst(FirstInst, IsConflict, Loc);
      }
      MemoryRuntimeCheck = IsConflict;
    }
  }

  // We have to do this trickery because the IRBuilder might fold the check to
  // a constant expression in which case there is no Instruction anchored in
  // the block.
  Instruction *Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck,
                                                 ConstantInt::getTrue(Ctx));
  ChkBuilder.Insert(Check, "memcheck.conflict");
  FirstInst = getFirstInst(FirstInst, Check, Loc);
  return std::make_pair(FirstInst, Check);
}
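
// Illustrative sketch for two pointers A (write) and B (read) in the same
// alias set but different dependency sets (value names follow the builders
// above; the %A.*/%B.* names are assumed):
//
//   %bound0 = icmp ule i8* %A.start, %B.end
//   %bound1 = icmp ule i8* %B.start, %A.end
//   %found.conflict = and i1 %bound0, %bound1
//   %memcheck.conflict = and i1 %found.conflict, true
//
// With N pointers, up to N*(N-1)/2 such pairwise checks are OR-ed together.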
void InnerLoopVectorizer::createEmptyLoop() {
  /*
   In this function we generate a new loop. The new loop will contain
   the vectorized instructions while the old loop will continue to run the
   scalar remainder.

       [ ] <-- Back-edge taken count overflow check.
    /   |
   /    v
  |    [ ] <-- vector loop bypass (may consist of multiple blocks).
  |  /  |
  | /   v
  ||   [ ]     <-- vector pre header.
  ||    |
  ||    v
  ||   [  ] \
  ||   [  ]_|   <-- vector loop.
  ||    |
  | \   v
  |   >[ ]   <--- middle-block.
  |  /  |
  | /   v
  -|- >[ ]     <--- new preheader.
   |    |
   |    v
   |   [ ] \
   |   [ ]_|   <-- old scalar loop to handle remainder.
    \   |
     \  v
      >[ ]     <-- exit block.
   ...
   */
  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
  BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
  assert(BypassBlock && "Invalid loop structure");
  assert(ExitBlock && "Must have an exit block");

  // Some loops have a single integer induction variable, while other loops
  // don't. One example is c++ iterators that often have multiple pointer
  // induction variables. In the code below we also support a case where we
  // don't have a single induction variable.
  OldInduction = Legal->getInduction();
  Type *IdxTy = Legal->getWidestInductionType();

  // Find the loop boundaries.
  const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop);
  assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");

  // The exit count might have the type of i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign extended before the
  // compare. The only way that we get a backedge taken count is that the
  // induction variable was signed and as such will not overflow. In such a
  // case truncation is legal.
  if (ExitCount->getType()->getPrimitiveSizeInBits() >
      IdxTy->getPrimitiveSizeInBits())
    ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy);

  const SCEV *BackedgeTakeCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy);
  // Get the total trip count from the count by adding 1.
  ExitCount = SE->getAddExpr(BackedgeTakeCount,
                             SE->getConstant(BackedgeTakeCount->getType(), 1));

  // Expand the trip count and place the new instructions in the preheader.
  // Notice that the pre-header does not change, only the loop body.
  SCEVExpander Exp(*SE, "induction");

  // We need to test whether the backedge-taken count is uint##_max. Adding one
  // to it will cause overflow and an incorrect loop trip count in the vector
  // body. In case of overflow we want to directly jump to the scalar remainder
  // loop.
  Value *BackedgeCount =
      Exp.expandCodeFor(BackedgeTakeCount, BackedgeTakeCount->getType(),
                        BypassBlock->getTerminator());
  if (BackedgeCount->getType()->isPointerTy())
    BackedgeCount = CastInst::CreatePointerCast(BackedgeCount, IdxTy,
                                                "backedge.ptrcnt.to.int",
                                                BypassBlock->getTerminator());
  Instruction *CheckBCOverflow =
      CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, BackedgeCount,
                      Constant::getAllOnesValue(BackedgeCount->getType()),
                      "backedge.overflow", BypassBlock->getTerminator());
  // The loop index does not have to start at Zero. Find the original start
  // value from the induction PHI node. If we don't have an induction variable
  // then we know that it starts at zero.
  Builder.SetInsertPoint(BypassBlock->getTerminator());
  Value *StartIdx = ExtendedIdx = OldInduction ?
    Builder.CreateZExt(OldInduction->getIncomingValueForBlock(BypassBlock),
                       IdxTy):
    ConstantInt::get(IdxTy, 0);

  // We need an instruction to anchor the overflow check on. StartIdx needs to
  // be defined before the overflow check branch. Because the scalar preheader
  // is going to merge the start index and so the overflow branch block needs
  // to contain a definition of the start index.
  Instruction *OverflowCheckAnchor = BinaryOperator::CreateAdd(
      StartIdx, ConstantInt::get(IdxTy, 0), "overflow.check.anchor",
      BypassBlock->getTerminator());

  // Count holds the overall loop count (N).
  Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
                                   BypassBlock->getTerminator());

  LoopBypassBlocks.push_back(BypassBlock);

  // Split the single block loop into the two loop structure described above.
  BasicBlock *VectorPH =
      BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
  BasicBlock *VecBody =
      VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
  BasicBlock *MiddleBlock =
      VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
  BasicBlock *ScalarPH =
      MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");

  // Create and register the new vector loop.
  Loop *Lp = new Loop();
  Loop *ParentLoop = OrigLoop->getParentLoop();

  // Insert the new loop into the loop nest and register the new basic blocks
  // before calling any utilities such as SCEV that require valid LoopInfo.
  if (ParentLoop) {
    ParentLoop->addChildLoop(Lp);
    ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
    ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
    ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
  } else {
    LI->addTopLevelLoop(Lp);
  }
  Lp->addBasicBlockToLoop(VecBody, LI->getBase());

  // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
  // inside the loop.
  Builder.SetInsertPoint(VecBody->getFirstNonPHI());

  // Generate the induction variable.
  setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction));
  Induction = Builder.CreatePHI(IdxTy, 2, "index");
  // The loop step is equal to the vectorization factor (num of SIMD elements)
  // times the unroll factor (num of SIMD instructions).
  Constant *Step = ConstantInt::get(IdxTy, VF * UF);

  // This is the IR builder that we use to add all of the logic for bypassing
  // the new vector loop.
  IRBuilder<> BypassBuilder(BypassBlock->getTerminator());
  setDebugLocFromInst(BypassBuilder,
                      getDebugLocFromInstOrOperands(OldInduction));

  // We may need to extend the index in case there is a type mismatch.
  // We know that the count starts at zero and does not overflow.
  if (Count->getType() != IdxTy) {
    // The exit count can be of pointer type. Convert it to the correct
    // integer type.
    if (ExitCount->getType()->isPointerTy())
      Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int");
    else
      Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast");
  }

  // Add the start index to the loop count to get the new end index.
  Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx");

  // Now we need to generate the expression for N - (N % VF), which is
  // the part that the vectorized body will execute.
  Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf");
  Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec");
  Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx,
                                                   "end.idx.rnd.down");
  // Now, compare the new count to zero. If it is zero skip the vector loop and
  // jump to the scalar loop.
  Value *Cmp =
      BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, "cmp.zero");

  BasicBlock *LastBypassBlock = BypassBlock;

  // Generate code to check that the loop's trip count that we computed by
  // adding one to the backedge-taken count will not overflow.
  {
    auto PastOverflowCheck =
        std::next(BasicBlock::iterator(OverflowCheckAnchor));
    BasicBlock *CheckBlock =
      LastBypassBlock->splitBasicBlock(PastOverflowCheck, "overflow.checked");
    if (ParentLoop)
      ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase());
    LoopBypassBlocks.push_back(CheckBlock);
    Instruction *OldTerm = LastBypassBlock->getTerminator();
    BranchInst::Create(ScalarPH, CheckBlock, CheckBCOverflow, OldTerm);
    OldTerm->eraseFromParent();
    LastBypassBlock = CheckBlock;
  }

  // Generate the code to check that the strides we assumed to be one are
  // really one. We want the new basic block to start at the first instruction
  // in a sequence of instructions that form a check.
  Instruction *StrideCheck;
  Instruction *FirstCheckInst;
  std::tie(FirstCheckInst, StrideCheck) =
      addStrideCheck(LastBypassBlock->getTerminator());
  if (StrideCheck) {
    // Create a new block containing the stride check.
    BasicBlock *CheckBlock =
        LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.stridecheck");
    if (ParentLoop)
      ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase());
    LoopBypassBlocks.push_back(CheckBlock);

    // Replace the branch into the memory check block with a conditional branch
    // for the "few elements case".
    Instruction *OldTerm = LastBypassBlock->getTerminator();
    BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm);
    OldTerm->eraseFromParent();

    Cmp = StrideCheck;
    LastBypassBlock = CheckBlock;
  }

  // Generate the code that checks in runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  Instruction *MemRuntimeCheck;
  std::tie(FirstCheckInst, MemRuntimeCheck) =
      addRuntimeCheck(LastBypassBlock->getTerminator());
  if (MemRuntimeCheck) {
    // Create a new block containing the memory check.
    BasicBlock *CheckBlock =
        LastBypassBlock->splitBasicBlock(MemRuntimeCheck, "vector.memcheck");
    if (ParentLoop)
      ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase());
    LoopBypassBlocks.push_back(CheckBlock);

    // Replace the branch into the memory check block with a conditional branch
    // for the "few elements case".
    Instruction *OldTerm = LastBypassBlock->getTerminator();
    BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm);
    OldTerm->eraseFromParent();

    Cmp = MemRuntimeCheck;
    LastBypassBlock = CheckBlock;
  }

  LastBypassBlock->getTerminator()->eraseFromParent();
  BranchInst::Create(MiddleBlock, VectorPH, Cmp,
                     LastBypassBlock);
2012-12-02 08:10:19 -05:00
2013-04-08 14:41:23 -04:00
// We are going to resume the execution of the scalar loop.
// Go over all of the induction variables that we found and fix the
// PHIs that are left in the scalar version of the loop.
// The starting values of PHI nodes depend on the counter of the last
// iteration in the vectorized loop.
// If we come from a bypass edge then we need to start from the original
// start value.
// This variable saves the new starting index for the scalar loop.
2014-11-24 04:08:18 -05:00
PHINode * ResumeIndex = nullptr ;
2013-04-08 14:41:23 -04:00
LoopVectorizationLegality : : InductionList : : iterator I , E ;
LoopVectorizationLegality : : InductionList * List = Legal - > getInductionVars ( ) ;
2013-12-21 19:04:03 -05:00
// Set builder to point to last bypass block.
BypassBuilder . SetInsertPoint ( LoopBypassBlocks . back ( ) - > getTerminator ( ) ) ;
2013-04-08 14:41:23 -04:00
for ( I = List - > begin ( ) , E = List - > end ( ) ; I ! = E ; + + I ) {
PHINode * OrigPhi = I - > first ;
LoopVectorizationLegality : : InductionInfo II = I - > second ;
2013-12-21 19:04:03 -05:00
Type * ResumeValTy = ( OrigPhi = = OldInduction ) ? IdxTy : OrigPhi - > getType ( ) ;
PHINode * ResumeVal = PHINode : : Create ( ResumeValTy , 2 , " resume.val " ,
2013-04-08 14:41:23 -04:00
MiddleBlock - > getTerminator ( ) ) ;
2013-12-21 19:04:03 -05:00
// We might have extended the type of the induction variable but we need a
// truncated version for the scalar loop.
PHINode * TruncResumeVal = ( OrigPhi = = OldInduction ) ?
PHINode : : Create ( OrigPhi - > getType ( ) , 2 , " trunc.resume.val " ,
2014-11-24 04:08:18 -05:00
MiddleBlock - > getTerminator ( ) ) : nullptr ;
// Create phi nodes to merge from the backedge-taken check block.
PHINode * BCResumeVal = PHINode : : Create ( ResumeValTy , 3 , " bc.resume.val " ,
ScalarPH - > getTerminator ( ) ) ;
BCResumeVal - > addIncoming ( ResumeVal , MiddleBlock ) ;
PHINode * BCTruncResumeVal = nullptr ;
if ( OrigPhi = = OldInduction ) {
BCTruncResumeVal =
PHINode : : Create ( OrigPhi - > getType ( ) , 2 , " bc.trunc.resume.val " ,
ScalarPH - > getTerminator ( ) ) ;
BCTruncResumeVal - > addIncoming ( TruncResumeVal , MiddleBlock ) ;
}
    Value *EndValue = nullptr;
    switch (II.IK) {
    case LoopVectorizationLegality::IK_NoInduction:
      llvm_unreachable("Unknown induction");
    case LoopVectorizationLegality::IK_IntInduction: {
      // Handle the integer induction counter.
      assert(OrigPhi->getType()->isIntegerTy() && "Invalid type");

      // We have the canonical induction variable.
      if (OrigPhi == OldInduction) {
        // Create a truncated version of the resume value for the scalar loop,
        // we might have promoted the type to a larger width.
        EndValue =
          BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType());
        // The new PHI merges the original incoming value, in case of a bypass,
        // or the value at the end of the vectorized loop.
        for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
          TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
        TruncResumeVal->addIncoming(EndValue, VecBody);

        BCTruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]);

        // We know what the end value is.
        EndValue = IdxEndRoundDown;
        // We also know which PHI node holds it.
        ResumeIndex = ResumeVal;
        break;
      }

      // Not the canonical induction variable - add the vector loop count to
      // the start value.
      Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,
                                                   II.StartValue->getType(),
                                                   "cast.crd");
      EndValue = BypassBuilder.CreateAdd(CRD, II.StartValue, "ind.end");
      break;
    }
    case LoopVectorizationLegality::IK_ReverseIntInduction: {
      // Convert the CountRoundDown variable to the PHI size.
      Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,
                                                   II.StartValue->getType(),
                                                   "cast.crd");
      // Handle reverse integer induction counter.
      EndValue = BypassBuilder.CreateSub(II.StartValue, CRD, "rev.ind.end");
      break;
    }
    case LoopVectorizationLegality::IK_PtrInduction: {
      // For pointer induction variables, calculate the offset using
      // the end index.
      EndValue = BypassBuilder.CreateGEP(II.StartValue, CountRoundDown,
                                         "ptr.ind.end");
      break;
    }
    case LoopVectorizationLegality::IK_ReversePtrInduction: {
      // The value at the end of the loop for the reverse pointer is
      // calculated by creating a GEP with a negative index starting from
      // the start value.
      Value *Zero = ConstantInt::get(CountRoundDown->getType(), 0);
      Value *NegIdx = BypassBuilder.CreateSub(Zero, CountRoundDown,
                                              "rev.ind.end");
      EndValue = BypassBuilder.CreateGEP(II.StartValue, NegIdx,
                                         "rev.ptr.ind.end");
      break;
    }
    } // end of case
    // The new PHI merges the original incoming value, in case of a bypass,
    // or the value at the end of the vectorized loop.
    for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) {
      if (OrigPhi == OldInduction)
        ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]);
      else
        ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
    }
    ResumeVal->addIncoming(EndValue, VecBody);

    // Fix the scalar body counter (PHI node).
    unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);

    // The old induction's phi node in the scalar body needs the truncated
    // value.
    if (OrigPhi == OldInduction) {
      BCResumeVal->addIncoming(StartIdx, LoopBypassBlocks[0]);
      OrigPhi->setIncomingValue(BlockIdx, BCTruncResumeVal);
    } else {
      BCResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]);
      OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
    }
  }
  // If we are generating a new induction variable then we also need to
  // generate the code that calculates the exit value. This value is not
  // simply the end of the counter because we may skip the vectorized body
  // in case of a runtime check.
  if (!OldInduction) {
    assert(!ResumeIndex && "Unexpected resume value found");
    ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
                                  MiddleBlock->getTerminator());
    for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
      ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]);
    ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
  }

  // Make sure that we found the index where the scalar loop needs to
  // continue.
  assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() &&
         "Invalid resume Index");

  // Add a check in the middle block to see if we have completed
  // all of the iterations in the first vector loop.
  // If (N - N%VF) == N, then we *don't* need to run the remainder.
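  // For example (illustrative numbers, UF = 1): with N = 1000 and VF = 4 we
  // get N - N%VF == 1000, so the compare below is true and we branch
  // straight to the exit block; with N = 1003 the values differ (1000 vs.
  // 1003) and the scalar remainder loop runs the last three iterations.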
  Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd,
                                ResumeIndex, "cmp.n",
                                MiddleBlock->getTerminator());
  BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator());
  // Remove the old terminator.
  MiddleBlock->getTerminator()->eraseFromParent();

  // Create i+1 and fill the PHINode.
  Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next");
  Induction->addIncoming(StartIdx, VectorPH);
  Induction->addIncoming(NextIdx, VecBody);
  // Create the compare.
  Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown);
  Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);

  // Now we have two terminators. Remove the old one from the block.
  VecBody->getTerminator()->eraseFromParent();

  // Get ready to start creating new instructions into the vectorized body.
  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());

  // Save the state.
  LoopVectorPreHeader = VectorPH;
  LoopScalarPreHeader = ScalarPH;
  LoopMiddleBlock = MiddleBlock;
  LoopExitBlock = ExitBlock;
  LoopVectorBody.push_back(VecBody);
  LoopScalarBody = OldBasicBlock;

  LoopVectorizeHints Hints(Lp, true);
  Hints.setAlreadyVectorized();
}
/// This function returns the identity element (or neutral element) for
/// the operation K.
Constant *
LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp) {
  switch (K) {
  case RK_IntegerXor:
  case RK_IntegerAdd:
  case RK_IntegerOr:
    // Adding, Xoring, Oring zero to a number does not change it.
    return ConstantInt::get(Tp, 0);
  case RK_IntegerMult:
    // Multiplying a number by 1 does not change it.
    return ConstantInt::get(Tp, 1);
  case RK_IntegerAnd:
    // AND-ing a number with an all-1 value does not change it.
    return ConstantInt::get(Tp, -1, true);
  case RK_FloatMult:
    // Multiplying a number by 1 does not change it.
    return ConstantFP::get(Tp, 1.0L);
  case RK_FloatAdd:
    // Adding zero to a number does not change it.
    return ConstantFP::get(Tp, 0.0L);
  default:
    llvm_unreachable("Unknown reduction kind");
  }
}
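// For example (illustrative): for an i32 add reduction this returns i32 0,
// and for an i32 and reduction it returns i32 -1 (all ones). vectorizeLoop()
// below splats this identity into every vector lane and then overwrites the
// first lane with the incoming scalar start value.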
/// This function translates the reduction kind to an LLVM binary operator.
static unsigned
getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) {
  switch (Kind) {
  case LoopVectorizationLegality::RK_IntegerAdd:
    return Instruction::Add;
  case LoopVectorizationLegality::RK_IntegerMult:
    return Instruction::Mul;
  case LoopVectorizationLegality::RK_IntegerOr:
    return Instruction::Or;
  case LoopVectorizationLegality::RK_IntegerAnd:
    return Instruction::And;
  case LoopVectorizationLegality::RK_IntegerXor:
    return Instruction::Xor;
  case LoopVectorizationLegality::RK_FloatMult:
    return Instruction::FMul;
  case LoopVectorizationLegality::RK_FloatAdd:
    return Instruction::FAdd;
  case LoopVectorizationLegality::RK_IntegerMinMax:
    return Instruction::ICmp;
  case LoopVectorizationLegality::RK_FloatMinMax:
    return Instruction::FCmp;
  default:
    llvm_unreachable("Unknown reduction operation");
  }
}
Value *createMinMaxOp(IRBuilder<> &Builder,
                      LoopVectorizationLegality::MinMaxReductionKind RK,
                      Value *Left,
                      Value *Right) {
  CmpInst::Predicate P = CmpInst::ICMP_NE;
  switch (RK) {
  default:
    llvm_unreachable("Unknown min/max reduction kind");
  case LoopVectorizationLegality::MRK_UIntMin:
    P = CmpInst::ICMP_ULT;
    break;
  case LoopVectorizationLegality::MRK_UIntMax:
    P = CmpInst::ICMP_UGT;
    break;
  case LoopVectorizationLegality::MRK_SIntMin:
    P = CmpInst::ICMP_SLT;
    break;
  case LoopVectorizationLegality::MRK_SIntMax:
    P = CmpInst::ICMP_SGT;
    break;
  case LoopVectorizationLegality::MRK_FloatMin:
    P = CmpInst::FCMP_OLT;
    break;
  case LoopVectorizationLegality::MRK_FloatMax:
    P = CmpInst::FCMP_OGT;
    break;
  }

  Value *Cmp;
  if (RK == LoopVectorizationLegality::MRK_FloatMin ||
      RK == LoopVectorizationLegality::MRK_FloatMax)
    Cmp = Builder.CreateFCmp(P, Left, Right, "rdx.minmax.cmp");
  else
    Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp");

  Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select");
  return Select;
}
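// For example (conceptual IR, VF = 4): an MRK_SIntMax reduction step emits
//   %rdx.minmax.cmp    = icmp sgt <4 x i32> %left, %right
//   %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp,
//                               <4 x i32> %left, <4 x i32> %right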
namespace {
struct CSEDenseMapInfo {
  static bool canHandle(Instruction *I) {
    return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
  }
  static inline Instruction *getEmptyKey() {
    return DenseMapInfo<Instruction *>::getEmptyKey();
  }
  static inline Instruction *getTombstoneKey() {
    return DenseMapInfo<Instruction *>::getTombstoneKey();
  }
  static unsigned getHashValue(Instruction *I) {
    assert(canHandle(I) && "Unknown instruction!");
    return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
                                                           I->value_op_end()));
  }
  static bool isEqual(Instruction *LHS, Instruction *RHS) {
    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
        LHS == getTombstoneKey() || RHS == getTombstoneKey())
      return LHS == RHS;
    return LHS->isIdenticalTo(RHS);
  }
};
}
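// A minimal sketch of the effect (illustrative IR): two structurally
// identical instructions such as
//   %a = getelementptr i32* %base, i64 %idx
//   %b = getelementptr i32* %base, i64 %idx
// hash to the same value and isEqual() reports them identical, so cse()
// below can replace all uses of %b with %a and erase %b.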
/// \brief Check whether this block is a predicated block.
/// Due to if predication of stores we might create a sequence of "if(pred)
/// a[i] = ...;" blocks. We start with one vectorized basic block. For every
/// conditional block we split this vectorized block. Therefore, every second
/// block will be a predicated one.
static bool isPredicatedBlock(unsigned BlockNum) {
  return BlockNum % 2;
}
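// For example (illustrative block names): after predicating two stores the
// vector body block list looks like [vector.body, pred.store.if,
// vector.body.split, pred.store.if2, vector.body.split2], so the blocks at
// odd indices (1, 3, ...) are the predicated ones.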
///\brief Perform cse of induction variable instructions.
static void cse(SmallVector<BasicBlock *, 4> &BBs) {
  // Perform simple cse.
  SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
  for (unsigned i = 0, e = BBs.size(); i != e; ++i) {
    BasicBlock *BB = BBs[i];
    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
      Instruction *In = I++;

      if (!CSEDenseMapInfo::canHandle(In))
        continue;

      // Check if we can replace this instruction with any of the
      // visited instructions.
      if (Instruction *V = CSEMap.lookup(In)) {
        In->replaceAllUsesWith(V);
        In->eraseFromParent();
        continue;
      }
      // Ignore instructions in conditional blocks. We create "if (pred)
      // a[i] = ...;" blocks for predicated stores. Every second block is a
      // predicated block.
      if (isPredicatedBlock(i))
        continue;

      CSEMap[In] = In;
    }
  }
}
/// \brief Adds a 'fast' flag to floating point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V)) {
    FastMathFlags Flags;
    Flags.setUnsafeAlgebra();
    cast<Instruction>(V)->setFastMathFlags(Flags);
  }
  return V;
}
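// For example (illustrative usage): wrapping a freshly built operation as in
//   Value *Sum = addFastMathFlag(Builder.CreateFAdd(A, B));
// produces 'fadd fast float %A, %B', permitting the reassociation that the
// horizontal FP reductions below rely on.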
void InnerLoopVectorizer::vectorizeLoop() {
  //===------------------------------------------------===//
  //
  // Notice: any optimization or new instruction that goes
  // into the code below should also be implemented in
  // the cost-model.
  //
  //===------------------------------------------------===//
  Constant *Zero = Builder.getInt32(0);

  // In order to support reduction variables we need to be able to vectorize
  // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
  // stages. First, we create a new vector PHI node with no incoming edges.
  // We use this value when we vectorize all of the instructions that use the
  // PHI. Next, after all of the instructions in the block are complete we
  // add the new incoming edges to the PHI. At this point all of the
  // instructions in the basic block are vectorized, so we can use them to
  // construct the PHI.
  PhiVector RdxPHIsToFix;
  // Scan the loop in a topological order to ensure that defs are vectorized
  // before users.
  LoopBlocksDFS DFS(OrigLoop);
  DFS.perform(LI);

  // Vectorize all of the blocks in the original loop.
  for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
       be = DFS.endRPO(); bb != be; ++bb)
    vectorizeBlockInLoop(*bb, &RdxPHIsToFix);

  // At this point every instruction in the original loop is widened to
  // a vector form. We are almost done. Now, we need to fix the PHI nodes
  // that we vectorized. The PHI nodes are currently empty because we did
  // not want to introduce cycles. Notice that the remaining PHI nodes
  // that we need to fix are reduction variables.

  // Create the 'reduced' values for each of the induction vars.
  // The reduced values are the vector values that we scalarize and combine
  // after the loop is finished.
  for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
       it != e; ++it) {
    PHINode *RdxPhi = *it;
    assert(RdxPhi && "Unable to recover vectorized PHI");

    // Find the reduction variable descriptor.
    assert(Legal->getReductionVars()->count(RdxPhi) &&
           "Unable to find the reduction variable");
    LoopVectorizationLegality::ReductionDescriptor RdxDesc =
      (*Legal->getReductionVars())[RdxPhi];

    setDebugLocFromInst(Builder, RdxDesc.StartValue);

    // We need to generate a reduction vector from the incoming scalar.
    // To do so, we need to generate the 'identity' vector and override
    // one of the elements with the incoming scalar reduction. We need
    // to do it in the vector-loop preheader.
    Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator());
    // This is the vector-clone of the value that leaves the loop.
    VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
    Type *VecTy = VectorExit[0]->getType();

    // Find the reduction identity variable. Zero for addition, or, xor,
    // one for multiplication, -1 for And.
    Value *Identity;
    Value *VectorStart;
    if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerMinMax ||
        RdxDesc.Kind == LoopVectorizationLegality::RK_FloatMinMax) {
      // MinMax reductions have the start value as their identity.
      if (VF == 1) {
        VectorStart = Identity = RdxDesc.StartValue;
      } else {
        VectorStart = Identity = Builder.CreateVectorSplat(VF,
                                                           RdxDesc.StartValue,
                                                           "minmax.ident");
      }
    } else {
      // Handle other reduction kinds:
      Constant *Iden =
        LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind,
                                                        VecTy->getScalarType());
      if (VF == 1) {
        Identity = Iden;
        // This vector is the Identity vector where the first element is the
        // incoming scalar reduction.
        VectorStart = RdxDesc.StartValue;
      } else {
        Identity = ConstantVector::getSplat(VF, Iden);

        // This vector is the Identity vector where the first element is the
        // incoming scalar reduction.
        VectorStart = Builder.CreateInsertElement(Identity,
                                                  RdxDesc.StartValue, Zero);
      }
    }
    // Fix the vector-loop phi.
    // Reductions do not have to start at zero. They can start with
    // any loop invariant values.
    VectorParts &VecRdxPhi = WidenMap.get(RdxPhi);
    BasicBlock *Latch = OrigLoop->getLoopLatch();
    Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch);
    VectorParts &Val = getVectorValue(LoopVal);
    for (unsigned part = 0; part < UF; ++part) {
      // Make sure to add the reduction start value only to the
      // first unroll part.
      Value *StartVal = (part == 0) ? VectorStart : Identity;
      cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal,
                                                  LoopVectorPreHeader);
      cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part],
                                                  LoopVectorBody.back());
    }
    // Before each round, move the insertion point right between
    // the PHIs and the values we are going to write.
    // This allows us to write both PHINodes and the extractelement
    // instructions.
    Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());

    VectorParts RdxParts;
    setDebugLocFromInst(Builder, RdxDesc.LoopExitInstr);
    for (unsigned part = 0; part < UF; ++part) {
      // This PHINode contains the vectorized reduction variable, or
      // the initial value vector, if we bypass the vector loop.
      VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr);
      PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
      Value *StartVal = (part == 0) ? VectorStart : Identity;
      for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
        NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);
      NewPhi->addIncoming(RdxExitVal[part],
                          LoopVectorBody.back());
      RdxParts.push_back(NewPhi);
    }

    // Reduce all of the unrolled parts into a single vector.
    Value *ReducedPartRdx = RdxParts[0];
    unsigned Op = getReductionBinOp(RdxDesc.Kind);
    setDebugLocFromInst(Builder, ReducedPartRdx);
    for (unsigned part = 1; part < UF; ++part) {
      if (Op != Instruction::ICmp && Op != Instruction::FCmp)
        // Floating point operations had to be 'fast' to enable the reduction.
        ReducedPartRdx = addFastMathFlag(
            Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part],
                                ReducedPartRdx, "bin.rdx"));
      else
        ReducedPartRdx = createMinMaxOp(Builder, RdxDesc.MinMaxKind,
                                        ReducedPartRdx, RdxParts[part]);
    }
    if (VF > 1) {
      // VF is a power of 2 so we can emit the reduction using log2(VF)
      // shuffles and vector ops, reducing the set of values being computed
      // by half each round.
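      // For example (conceptual IR): an i32 add reduction with VF = 4 emits
      //   %s1 = shufflevector <4 x i32> %v, <4 x i32> undef,
      //                       <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
      //   %r1 = add <4 x i32> %v, %s1
      //   %s2 = shufflevector <4 x i32> %r1, <4 x i32> undef,
      //                       <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
      //   %r2 = add <4 x i32> %r1, %s2
      // and the final result is extracted from lane 0 of %r2.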
      assert(isPowerOf2_32(VF) &&
             "Reduction emission only supported for pow2 vectors!");
      Value *TmpVec = ReducedPartRdx;
      SmallVector<Constant *, 32> ShuffleMask(VF, nullptr);
      for (unsigned i = VF; i != 1; i >>= 1) {
        // Move the upper half of the vector to the lower half.
        for (unsigned j = 0; j != i / 2; ++j)
          ShuffleMask[j] = Builder.getInt32(i / 2 + j);

        // Fill the rest of the mask with undef.
        std::fill(&ShuffleMask[i / 2], ShuffleMask.end(),
                  UndefValue::get(Builder.getInt32Ty()));

        Value *Shuf =
            Builder.CreateShuffleVector(TmpVec,
                                        UndefValue::get(TmpVec->getType()),
                                        ConstantVector::get(ShuffleMask),
                                        "rdx.shuf");

        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
          // Floating point operations had to be 'fast' to enable the
          // reduction.
          TmpVec = addFastMathFlag(Builder.CreateBinOp(
              (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"));
        else
          TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf);
      }

      // The result is in the first element of the vector.
      ReducedPartRdx = Builder.CreateExtractElement(TmpVec,
                                                    Builder.getInt32(0));
    }
    // Create a phi node that merges control-flow from the backedge-taken
    // check block and the middle block.
    PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx",
                                          LoopScalarPreHeader->getTerminator());
    BCBlockPhi->addIncoming(RdxDesc.StartValue, LoopBypassBlocks[0]);
    BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);

    // Now, we need to fix the users of the reduction variable
    // inside and outside of the scalar remainder loop.
    // We know that the loop is in LCSSA form. We need to update the
    // PHI nodes in the exit blocks.
    for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
         LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
      PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
      if (!LCSSAPhi) break;

      // All PHINodes need to have a single entry edge, or two if
      // we already fixed them.
      assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");

      // We found our reduction value exit-PHI. Update it with the
      // incoming bypass edge.
      if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) {
        // Add an edge coming from the bypass.
        LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
        break;
      }
    } // end of the LCSSA phi scan.

    // Fix the scalar loop reduction variable with the incoming reduction sum
    // from the vector body and from the backedge value.
    int IncomingEdgeBlockIdx =
        (RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch());
    assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
    // Pick the other block.
    int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
    (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
    (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
  } // end of for each redux variable.
  fixLCSSAPHIs();

  // Remove redundant induction instructions.
  cse(LoopVectorBody);
}

void InnerLoopVectorizer::fixLCSSAPHIs() {
  for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
       LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
    PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
    if (!LCSSAPhi) break;
    if (LCSSAPhi->getNumIncomingValues() == 1)
      LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
                            LoopMiddleBlock);
  }
}
InnerLoopVectorizer::VectorParts
InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
  assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) &&
         "Invalid edge");

  // Look for cached value.
  std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
  EdgeMaskCache::iterator ECEntryIt = MaskCache.find(Edge);
  if (ECEntryIt != MaskCache.end())
    return ECEntryIt->second;

  VectorParts SrcMask = createBlockInMask(Src);

  // The terminator has to be a branch inst!
  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
  assert(BI && "Unexpected terminator found");

  if (BI->isConditional()) {
    VectorParts EdgeMask = getVectorValue(BI->getCondition());

    if (BI->getSuccessor(0) != Dst)
      for (unsigned part = 0; part < UF; ++part)
        EdgeMask[part] = Builder.CreateNot(EdgeMask[part]);

    for (unsigned part = 0; part < UF; ++part)
      EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]);

    MaskCache[Edge] = EdgeMask;
    return EdgeMask;
  }

  MaskCache[Edge] = SrcMask;
  return SrcMask;
}
InnerLoopVectorizer::VectorParts
InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");

  // Loop incoming mask is all-one.
  if (OrigLoop->getHeader() == BB) {
    Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1);
    return getVectorValue(C);
  }

  // This is the block mask. We OR all of the incoming edge masks, starting
  // from zero.
  Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0);
  VectorParts BlockMask = getVectorValue(Zero);

  // For each pred:
  for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) {
    VectorParts EM = createEdgeMask(*it, BB);
    for (unsigned part = 0; part < UF; ++part)
      BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]);
  }

  return BlockMask;
}
void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
                                              InnerLoopVectorizer::VectorParts &Entry,
                                              unsigned UF, unsigned VF,
                                              PhiVector *PV) {
  PHINode *P = cast<PHINode>(PN);
  // Handle reduction variables:
  if (Legal->getReductionVars()->count(P)) {
    for (unsigned part = 0; part < UF; ++part) {
      // This is phase one of vectorizing PHIs.
      Type *VecTy = (VF == 1) ? PN->getType() :
        VectorType::get(PN->getType(), VF);
      Entry[part] =
          PHINode::Create(VecTy, 2, "vec.phi",
                          LoopVectorBody.back()->getFirstInsertionPt());
    }
    PV->push_back(P);
    return;
  }

  setDebugLocFromInst(Builder, P);
  // Check for PHI nodes that are lowered to vector selects.
  if (P->getParent() != OrigLoop->getHeader()) {
    // We know that all PHIs in non-header blocks are converted into
    // selects, so we don't have to worry about the insertion order and we
    // can just use the builder.
    // At this point we generate the predication tree. There may be
    // duplications since this is a simple recursive scan, but future
    // optimizations will clean it up.

    unsigned NumIncoming = P->getNumIncomingValues();

    // Generate a sequence of selects of the form:
    // SELECT(Mask3, In3,
    //      SELECT(Mask2, In2,
    //                   ( ...)))
    for (unsigned In = 0; In < NumIncoming; In++) {
      VectorParts Cond = createEdgeMask(P->getIncomingBlock(In),
                                        P->getParent());
      VectorParts &In0 = getVectorValue(P->getIncomingValue(In));

      for (unsigned part = 0; part < UF; ++part) {
        // We might have single edge PHIs (blocks) - use an identity
        // 'select' for the first PHI operand.
        if (In == 0)
          Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
                                             In0[part]);
        else
          // Select between the current value and the previous incoming edge
          // based on the incoming mask.
          Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
                                             Entry[part], "predphi");
      }
    }
    return;
  }
  // This PHINode must be an induction variable.
  // Make sure that we know about it.
  assert(Legal->getInductionVars()->count(P) &&
         "Not an induction variable");

  LoopVectorizationLegality::InductionInfo II =
    Legal->getInductionVars()->lookup(P);

  switch (II.IK) {
  case LoopVectorizationLegality::IK_NoInduction:
    llvm_unreachable("Unknown induction");
  case LoopVectorizationLegality::IK_IntInduction: {
    assert(P->getType() == II.StartValue->getType() && "Types must match");
    Type *PhiTy = P->getType();
    Value *Broadcasted;
    if (P == OldInduction) {
      // Handle the canonical induction variable. We might have had to
      // extend the type.
      Broadcasted = Builder.CreateTrunc(Induction, PhiTy);
    } else {
      // Handle other induction variables that are now based on the
      // canonical one.
      Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx,
                                               "normalized.idx");
      NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy);
      Broadcasted = Builder.CreateAdd(II.StartValue, NormalizedIdx,
                                      "offset.idx");
    }
    Broadcasted = getBroadcastInstrs(Broadcasted);
    // After broadcasting the induction variable we need to make the vector
    // consecutive by adding 0, 1, 2, etc.
    for (unsigned part = 0; part < UF; ++part)
      Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false);
    return;
  }
  case LoopVectorizationLegality::IK_ReverseIntInduction:
  case LoopVectorizationLegality::IK_PtrInduction:
  case LoopVectorizationLegality::IK_ReversePtrInduction:
    // Handle reverse integer and pointer inductions.
    Value *StartIdx = ExtendedIdx;
    // This is the normalized GEP that starts counting at zero.
    Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
                                             "normalized.idx");

    // Handle the reverse integer induction variable case.
    if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) {
      IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType());
      Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy,
                                             "resize.norm.idx");
      Value *ReverseInd = Builder.CreateSub(II.StartValue, CNI,
                                            "reverse.idx");

      // This is a new value so do not hoist it out.
      Value *Broadcasted = getBroadcastInstrs(ReverseInd);

      // After broadcasting the induction variable we need to make the
      // vector consecutive by adding ... -3, -2, -1, 0.
      for (unsigned part = 0; part < UF; ++part)
        Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part,
                                           true);
      return;
    }

    // Handle the pointer induction variable case.
    assert(P->getType()->isPointerTy() && "Unexpected type.");

    // Is this a reverse induction ptr or a consecutive induction ptr?
    bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction ==
                    II.IK);

    // This is the vector of results. Notice that we don't generate
    // vector geps because scalar geps result in better code.
    for (unsigned part = 0; part < UF; ++part) {
      if (VF == 1) {
        int EltIndex = (part) * (Reverse ? -1 : 1);
        Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
        Value *GlobalIdx;
        if (Reverse)
          GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx");
        else
          GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");

        Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
                                           "next.gep");
        Entry[part] = SclrGep;
        continue;
      }

      Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
      for (unsigned int i = 0; i < VF; ++i) {
        int EltIndex = (i + part * VF) * (Reverse ? -1 : 1);
        Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
        Value *GlobalIdx;
        if (!Reverse)
          GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
        else
          GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx");

        Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
                                           "next.gep");
        VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
                                             Builder.getInt32(i),
                                             "insert.gep");
      }
      Entry[part] = VecVal;
    }

    return;
  }
}
void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
  // For each instruction in the old loop.
  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
    VectorParts &Entry = WidenMap.get(it);
    switch (it->getOpcode()) {
    case Instruction::Br:
      // Nothing to do for PHIs and BR, since we already took care of the
      // loop control flow instructions.
      continue;
    case Instruction::PHI: {
      // Vectorize PHINodes.
      widenPHIInstruction(it, Entry, UF, VF, PV);
      continue;
    } // End of PHI.

    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
      // Just widen binops.
      BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it);
      setDebugLocFromInst(Builder, BinOp);
      VectorParts &A = getVectorValue(it->getOperand(0));
      VectorParts &B = getVectorValue(it->getOperand(1));

      // Use this vector value for all users of the original instruction.
      for (unsigned Part = 0; Part < UF; ++Part) {
        Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);

        if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
          VecOp->copyIRFlags(BinOp);

        Entry[Part] = V;
      }

      propagateMetadata(Entry, it);
      break;
    }
    case Instruction::Select: {
      // Widen selects.
      // If the selector is loop invariant we can create a select
      // instruction with a scalar condition. Otherwise, use vector-select.
      bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)),
                                               OrigLoop);
      setDebugLocFromInst(Builder, it);

      // The condition can be loop invariant but still defined inside the
      // loop. This means that we can't just use the original 'cond' value.
      // We have to take the 'vectorized' value and pick the first lane.
      // Instcombine will make this a no-op.
      VectorParts &Cond = getVectorValue(it->getOperand(0));
      VectorParts &Op0 = getVectorValue(it->getOperand(1));
      VectorParts &Op1 = getVectorValue(it->getOperand(2));

      Value *ScalarCond = (VF == 1) ? Cond[0] :
        Builder.CreateExtractElement(Cond[0], Builder.getInt32(0));

      for (unsigned Part = 0; Part < UF; ++Part) {
        Entry[Part] = Builder.CreateSelect(
          InvariantCond ? ScalarCond : Cond[Part],
          Op0[Part],
          Op1[Part]);
      }

      propagateMetadata(Entry, it);
      break;
    }

    case Instruction::ICmp:
    case Instruction::FCmp: {
      // Widen compares. Generate vector compares.
      bool FCmp = (it->getOpcode() == Instruction::FCmp);
      CmpInst *Cmp = dyn_cast<CmpInst>(it);
      setDebugLocFromInst(Builder, it);
      VectorParts &A = getVectorValue(it->getOperand(0));
      VectorParts &B = getVectorValue(it->getOperand(1));
      for (unsigned Part = 0; Part < UF; ++Part) {
        Value *C = nullptr;
        if (FCmp)
          C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
        else
          C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
        Entry[Part] = C;
      }

      propagateMetadata(Entry, it);
      break;
    }
    case Instruction::Store:
    case Instruction::Load:
      vectorizeMemoryInstruction(it);
      break;
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast: {
      CastInst *CI = dyn_cast<CastInst>(it);
      setDebugLocFromInst(Builder, it);
      /// Optimize the special case where the source is the induction
      /// variable. Notice that we can only optimize the 'trunc' case
      /// because: a. FP conversions lose precision, b. sext/zext may wrap,
      /// c. other casts depend on pointer size.
      if (CI->getOperand(0) == OldInduction &&
          it->getOpcode() == Instruction::Trunc) {
        Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction,
                                               CI->getType());
        Value *Broadcasted = getBroadcastInstrs(ScalarCast);
        for (unsigned Part = 0; Part < UF; ++Part)
          Entry[Part] = getConsecutiveVector(Broadcasted, VF * Part, false);
        propagateMetadata(Entry, it);
        break;
      }
      /// Vectorize casts.
      Type *DestTy = (VF == 1) ? CI->getType() :
        VectorType::get(CI->getType(), VF);

      VectorParts &A = getVectorValue(it->getOperand(0));
      for (unsigned Part = 0; Part < UF; ++Part)
        Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy);
      propagateMetadata(Entry, it);
      break;
    }
    case Instruction::Call: {
      // Ignore dbg intrinsics.
      if (isa<DbgInfoIntrinsic>(it))
        break;
      setDebugLocFromInst(Builder, it);

      Module *M = BB->getParent()->getParent();
      CallInst *CI = cast<CallInst>(it);
      Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
      assert(ID && "Not an intrinsic call!");
      switch (ID) {
      case Intrinsic::assume:
      case Intrinsic::lifetime_end:
      case Intrinsic::lifetime_start:
        scalarizeInstruction(it);
        break;
      default:
        bool HasScalarOpd = hasVectorInstrinsicScalarOpd(ID, 1);
        for (unsigned Part = 0; Part < UF; ++Part) {
          SmallVector<Value *, 4> Args;
          for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
            if (HasScalarOpd && i == 1) {
              Args.push_back(CI->getArgOperand(i));
              continue;
            }
            VectorParts &Arg = getVectorValue(CI->getArgOperand(i));
            Args.push_back(Arg[Part]);
          }
          Type *Tys[] = {CI->getType()};
          if (VF > 1)
            Tys[0] = VectorType::get(CI->getType()->getScalarType(), VF);

          Function *F = Intrinsic::getDeclaration(M, ID, Tys);
          Entry[Part] = Builder.CreateCall(F, Args);
        }

        propagateMetadata(Entry, it);
        break;
      }
      break;
    }

    default:
      // All other instructions are unsupported. Scalarize them.
      scalarizeInstruction(it);
      break;
    } // end of switch.
  } // end of for_each instr.
}
void InnerLoopVectorizer::updateAnalysis() {
  // Forget the original basic block.
  SE->forgetLoop(OrigLoop);

  // Update the dominator tree information.
  assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
         "Entry does not dominate exit.");

  for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
    DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I - 1]);
  DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back());

  // Due to if predication of stores we might create a sequence of "if(pred)
  // a[i] = ...;" blocks.
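  // For example (illustrative): with LoopVectorBody = [BB0, P1, BB1, P2, BB2]
  // where P1 and P2 are predicated, BB0's idom is the vector preheader, P1's
  // idom is BB0, and BB1's idom is also BB0 (not P1, which only conditionally
  // executes), and so on.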
  for (unsigned i = 0, e = LoopVectorBody.size(); i != e; ++i) {
    if (i == 0)
      DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader);
    else if (isPredicatedBlock(i)) {
      DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i - 1]);
    } else {
      DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i - 2]);
    }
  }

  DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks[1]);
  DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
  DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
  DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);

  DEBUG(DT->verifyDomTree());
}
/// \brief Check whether it is safe to if-convert this phi node.
///
/// Phi nodes with constant expressions that can trap are not safe to if
/// convert.
static bool canIfConvertPHINodes(BasicBlock *BB) {
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
    PHINode *Phi = dyn_cast<PHINode>(I);
    if (!Phi)
      return true;
    for (unsigned p = 0, e = Phi->getNumIncomingValues(); p != e; ++p)
      if (Constant *C = dyn_cast<Constant>(Phi->getIncomingValue(p)))
        if (C->canTrap())
          return false;
  }
  return true;
}
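// For example (illustrative): an incoming value that is a constant sdiv
// expression whose divisor is not known to be non-zero can trap, so
// speculating such a phi via if-conversion would be unsafe.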
bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
  if (!EnableIfConversion) {
    emitAnalysis(Report() << "if-conversion is disabled");
    return false;
  }

  assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");

  // A list of pointers that we can safely read and write to.
  SmallPtrSet<Value *, 8> SafePointers;

  // Collect safe addresses.
  for (Loop::block_iterator BI = TheLoop->block_begin(),
       BE = TheLoop->block_end(); BI != BE; ++BI) {
    BasicBlock *BB = *BI;

    if (blockNeedsPredication(BB))
      continue;

    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
      if (LoadInst *LI = dyn_cast<LoadInst>(I))
        SafePointers.insert(LI->getPointerOperand());
      else if (StoreInst *SI = dyn_cast<StoreInst>(I))
        SafePointers.insert(SI->getPointerOperand());
    }
  }

  // Collect the blocks that need predication.
  BasicBlock *Header = TheLoop->getHeader();
  for (Loop::block_iterator BI = TheLoop->block_begin(),
       BE = TheLoop->block_end(); BI != BE; ++BI) {
    BasicBlock *BB = *BI;

    // We don't support switch statements inside loops.
    if (!isa<BranchInst>(BB->getTerminator())) {
      emitAnalysis(Report(BB->getTerminator())
                   << "loop contains a switch statement");
      return false;
    }

    // We must be able to predicate all blocks that need to be predicated.
    if (blockNeedsPredication(BB)) {
      if (!blockCanBePredicated(BB, SafePointers)) {
        emitAnalysis(Report(BB->getTerminator())
                     << "control flow cannot be substituted for a select");
        return false;
      }
    } else if (BB != Header && !canIfConvertPHINodes(BB)) {
      emitAnalysis(Report(BB->getTerminator())
                   << "control flow cannot be substituted for a select");
      return false;
    }
  }

  // We can if-convert this loop.
  return true;
}
bool LoopVectorizationLegality::canVectorize() {
  // We must have a loop in canonical form. Loops with indirectbr in them
  // cannot be canonicalized.
  if (!TheLoop->getLoopPreheader()) {
    emitAnalysis(
        Report() << "loop control flow is not understood by vectorizer");
    return false;
  }

  // We can only vectorize innermost loops.
  if (TheLoop->getSubLoopsVector().size()) {
    emitAnalysis(Report() << "loop is not the innermost loop");
    return false;
  }

  // We must have a single backedge.
  if (TheLoop->getNumBackEdges() != 1) {
    emitAnalysis(
        Report() << "loop control flow is not understood by vectorizer");
    return false;
  }

  // We must have a single exiting block.
  if (!TheLoop->getExitingBlock()) {
    emitAnalysis(
        Report() << "loop control flow is not understood by vectorizer");
    return false;
  }

  // We only handle bottom-tested loops, i.e., loops in which the condition
  // is checked at the end of each iteration. With that we can assume that
  // all instructions in the loop are executed the same number of times.
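  // For example (illustrative C): a rotated 'do { ... } while (++i < n);'
  // loop is bottom-tested because its latch block performs the exit test; a
  // loop whose exit test sits in a block other than the latch is rejected by
  // the check below.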
  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
    emitAnalysis(
        Report() << "loop control flow is not understood by vectorizer");
    return false;
  }

  // We need to have a loop header.
  DEBUG(dbgs() << "LV: Found a loop: " <<
        TheLoop->getHeader()->getName() << '\n');

  // Check if we can if-convert non-single-bb loops.
  unsigned NumBlocks = TheLoop->getNumBlocks();
  if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
    DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
    return false;
  }

  // ScalarEvolution needs to be able to find the exit count.
  const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop);
  if (ExitCount == SE->getCouldNotCompute()) {
    emitAnalysis(Report() << "could not determine number of loop iterations");
    DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
    return false;
  }

  // Check if we can vectorize the instructions and CFG in this loop.
  if (!canVectorizeInstrs()) {
    DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
    return false;
  }

  // Go over each instruction and look at memory deps.
  if (!canVectorizeMemory()) {
    DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
    return false;
  }

  // Collect all of the variables that remain uniform after vectorization.
  collectLoopUniforms();

  DEBUG(dbgs() << "LV: We can vectorize this loop" <<
        (PtrRtCheck.Need ? " (with a runtime bound check)" : "")
        << "!\n");

  // Okay! We can vectorize. At this point we don't have any other mem
  // analysis which may limit our maximum vectorization factor, so just
  // return true with no restrictions.
  return true;
}
static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
  if (Ty->isPointerTy())
    return DL.getIntPtrType(Ty);

  // It is possible that chars or shorts overflow when we ask for the loop's
  // trip count, work around this by changing the type size.
  if (Ty->getScalarSizeInBits() < 32)
    return Type::getInt32Ty(Ty->getContext());

  return Ty;
}
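// For example (illustrative, assuming a DataLayout with 64-bit pointers):
// an i8* induction type maps to i64, an i16 counter is widened to i32, and
// an i32 or i64 counter is returned unchanged.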
static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
  Ty0 = convertPointerToIntegerType(DL, Ty0);
  Ty1 = convertPointerToIntegerType(DL, Ty1);
  if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
    return Ty0;
  return Ty1;
}
/// \brief Check that the instruction has outside loop users and is not an
/// identified reduction variable.
static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
                               SmallPtrSetImpl<Value *> &Reductions) {
  // Reduction instructions are allowed to have exit users. All other
  // instructions must not have external users.
  if (!Reductions.count(Inst))
    // Check that all of the users of the loop are inside the BB.
    for (User *U : Inst->users()) {
      Instruction *UI = cast<Instruction>(U);
      // This user may be a reduction exit value.
      if (!TheLoop->contains(UI)) {
        DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
        return true;
      }
    }
  return false;
}
bool LoopVectorizationLegality::canVectorizeInstrs() {
  BasicBlock *PreHeader = TheLoop->getLoopPreheader();
  BasicBlock *Header = TheLoop->getHeader();

  // Look for the attribute signaling the absence of NaNs.
  Function &F = *Header->getParent();
  if (F.hasFnAttribute("no-nans-fp-math"))
    HasFunNoNaNAttr = F.getAttributes().getAttribute(
        AttributeSet::FunctionIndex,
        "no-nans-fp-math").getValueAsString() == "true";

  // For each block in the loop.
  for (Loop::block_iterator bb = TheLoop->block_begin(),
       be = TheLoop->block_end(); bb != be; ++bb) {

    // Scan the instructions in the block and look for hazards.
    for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
         ++it) {

      if (PHINode *Phi = dyn_cast<PHINode>(it)) {
        Type *PhiTy = Phi->getType();
        // Check that this PHI type is allowed.
        if (!PhiTy->isIntegerTy() &&
            !PhiTy->isFloatingPointTy() &&
            !PhiTy->isPointerTy()) {
          emitAnalysis(Report(it)
                       << "loop control flow is not understood by vectorizer");
          DEBUG(dbgs() << "LV: Found a non-int, non-pointer PHI.\n");
          return false;
        }

        // If this PHINode is not in the header block, then we know that we
        // can convert it to select during if-conversion. No need to check if
        // the PHIs in this block are induction or reduction variables.
        if (*bb != Header) {
          // Check that this instruction has no outside users or is an
          // identified reduction value with an outside user.
          if (!hasOutsideLoopUser(TheLoop, it, AllowedExit))
            continue;
          emitAnalysis(Report(it) << "value could not be identified as "
                                     "an induction or reduction variable");
          return false;
        }

        // We only allow if-converted PHIs with exactly two incoming values.
        if (Phi->getNumIncomingValues() != 2) {
          emitAnalysis(Report(it)
                       << "control flow not understood by vectorizer");
          DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
          return false;
        }

        // This is the value coming from the preheader.
        Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);
        // Check if this is an induction variable.
        InductionKind IK = isInductionVariable(Phi);

        if (IK_NoInduction != IK) {
          // Get the widest type.
          if (!WidestIndTy)
            WidestIndTy = convertPointerToIntegerType(*DL, PhiTy);
          else
            WidestIndTy = getWiderType(*DL, PhiTy, WidestIndTy);

          // Int inductions are special because we only allow one IV.
          if (IK == IK_IntInduction) {
            // Use the phi node with the widest type as induction. Use the
            // last one if there are multiple (no good reason for doing this
            // other than it is expedient).
            if (!Induction || PhiTy == WidestIndTy)
              Induction = Phi;
          }

          DEBUG(dbgs() << "LV: Found an induction variable.\n");
          Inductions[Phi] = InductionInfo(StartValue, IK);

          // Until we explicitly handle the case of an induction variable
          // with an outside loop user we have to give up vectorizing this
          // loop.
          if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) {
            emitAnalysis(Report(it) << "use of induction value outside of the "
                                       "loop is not handled by vectorizer");
            return false;
          }

          continue;
        }
        if (AddReductionVar(Phi, RK_IntegerAdd)) {
          DEBUG(dbgs() << "LV: Found an ADD reduction PHI." << *Phi << "\n");
          continue;
        }
        if (AddReductionVar(Phi, RK_IntegerMult)) {
          DEBUG(dbgs() << "LV: Found a MUL reduction PHI." << *Phi << "\n");
          continue;
        }
        if (AddReductionVar(Phi, RK_IntegerOr)) {
          DEBUG(dbgs() << "LV: Found an OR reduction PHI." << *Phi << "\n");
          continue;
        }
        if (AddReductionVar(Phi, RK_IntegerAnd)) {
          DEBUG(dbgs() << "LV: Found an AND reduction PHI." << *Phi << "\n");
          continue;
        }
        if (AddReductionVar(Phi, RK_IntegerXor)) {
          DEBUG(dbgs() << "LV: Found a XOR reduction PHI." << *Phi << "\n");
          continue;
        }
        if (AddReductionVar(Phi, RK_IntegerMinMax)) {
          DEBUG(dbgs() << "LV: Found a MINMAX reduction PHI." << *Phi << "\n");
          continue;
        }
        if (AddReductionVar(Phi, RK_FloatMult)) {
          DEBUG(dbgs() << "LV: Found an FMult reduction PHI." << *Phi << "\n");
          continue;
        }
        if (AddReductionVar(Phi, RK_FloatAdd)) {
          DEBUG(dbgs() << "LV: Found an FAdd reduction PHI." << *Phi << "\n");
          continue;
        }
        if (AddReductionVar(Phi, RK_FloatMinMax)) {
          DEBUG(dbgs() << "LV: Found a float MINMAX reduction PHI." << *Phi <<
                "\n");
          continue;
        }

        emitAnalysis(Report(it) << "value that could not be identified as "
                                   "reduction is used outside the loop");
        DEBUG(dbgs() << "LV: Found an unidentified PHI." << *Phi << "\n");
        return false;
      } // end of PHI handling

      // We still don't handle functions. However, we can ignore dbg intrinsic
      // calls and we do handle certain intrinsic and libm functions.
      CallInst *CI = dyn_cast<CallInst>(it);
      if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI)) {
        emitAnalysis(Report(it) << "call instruction cannot be vectorized");
        DEBUG(dbgs() << "LV: Found a call site.\n");
        return false;
      }

      // Intrinsics such as powi, cttz and ctlz are legal to vectorize if the
      // second argument is the same for every iteration (i.e. loop invariant).
      if (CI &&
          hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) {
        if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) {
          emitAnalysis(Report(it)
                       << "intrinsic instruction cannot be vectorized");
          DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n");
          return false;
        }
      }
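
      // For example, 'llvm.powi(x[i], k)' with a loop-invariant exponent 'k'
      // keeps a scalar second operand after widening and can be vectorized,
      // whereas 'llvm.powi(x[i], i)' would need a per-lane exponent and is
      // rejected by the check above.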

      // Check that the instruction return type is vectorizable.
      // Also, we can't vectorize extractelement instructions.
      if ((!VectorType::isValidElementType(it->getType()) &&
           !it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) {
        emitAnalysis(Report(it)
                     << "instruction return type cannot be vectorized");
        DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
        return false;
      }

      // Check that the stored type is vectorizable.
      if (StoreInst *ST = dyn_cast<StoreInst>(it)) {
        Type *T = ST->getValueOperand()->getType();
        if (!VectorType::isValidElementType(T)) {
          emitAnalysis(Report(ST) << "store instruction cannot be vectorized");
          return false;
        }
        if (EnableMemAccessVersioning)
          collectStridedAccess(ST);
      }

      if (EnableMemAccessVersioning)
        if (LoadInst *LI = dyn_cast<LoadInst>(it))
          collectStridedAccess(LI);

      // Reduction instructions are allowed to have exit users.
      // All other instructions must not have external users.
      if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) {
        emitAnalysis(Report(it) << "value cannot be used outside the loop");
        return false;
      }

    } // next instr.
  }

  if (!Induction) {
    DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
    if (Inductions.empty()) {
      emitAnalysis(Report()
                   << "loop induction variable could not be identified");
      return false;
    }
  }

  return true;
}

///\brief Strip a GEP whose indices, apart from the induction operand, are all
/// loop invariant, and return the induction operand of the GEP pointer.
static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE,
                                 const DataLayout *DL, Loop *Lp) {
  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
  if (!GEP)
    return Ptr;

  unsigned InductionOperand = getGEPInductionOperand(DL, GEP);

  // Check that all of the gep indices are uniform except for our induction
  // operand.
  for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i)
    if (i != InductionOperand &&
        !SE->isLoopInvariant(SE->getSCEV(GEP->getOperand(i)), Lp))
      return Ptr;
  return GEP->getOperand(InductionOperand);
}

///\brief Look for the unique cast use of the passed value with type \p Ty;
/// return null when there is no such cast or more than one.
static Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) {
  Value *UniqueCast = nullptr;
  for (User *U : Ptr->users()) {
    CastInst *CI = dyn_cast<CastInst>(U);
    if (CI && CI->getType() == Ty) {
      if (!UniqueCast)
        UniqueCast = CI;
      else
        return nullptr;
    }
  }
  return UniqueCast;
}

///\brief Get the stride of a pointer access in a loop.
/// Looks for symbolic strides "a[i*stride]". Returns the symbolic stride as a
/// pointer to the Value, or null otherwise.
static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE,
                                   const DataLayout *DL, Loop *Lp) {
  const PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
  if (!PtrTy || PtrTy->isAggregateType())
    return nullptr;

  // Try to remove a gep instruction to make the pointer (actually the index
  // at this point) easier to analyze. If OrigPtr is equal to Ptr we are
  // analyzing the pointer, otherwise we are analyzing the index.
  Value *OrigPtr = Ptr;

  // The size of the pointer access.
  int64_t PtrAccessSize = 1;

  Ptr = stripGetElementPtr(Ptr, SE, DL, Lp);
  const SCEV *V = SE->getSCEV(Ptr);

  if (Ptr != OrigPtr)
    // Strip off casts.
    while (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V))
      V = C->getOperand();

  const SCEVAddRecExpr *S = dyn_cast<SCEVAddRecExpr>(V);
  if (!S)
    return nullptr;

  V = S->getStepRecurrence(*SE);
  if (!V)
    return nullptr;

  // Strip off the size of access multiplication if we are still analyzing the
  // pointer.
  if (OrigPtr == Ptr) {
    PtrAccessSize = DL->getTypeAllocSize(PtrTy->getElementType());
    if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(V)) {
      if (M->getOperand(0)->getSCEVType() != scConstant)
        return nullptr;

      const APInt &APStepVal =
          cast<SCEVConstant>(M->getOperand(0))->getValue()->getValue();

      // Huge step value - give up.
      if (APStepVal.getBitWidth() > 64)
        return nullptr;

      int64_t StepVal = APStepVal.getSExtValue();
      if (PtrAccessSize != StepVal)
        return nullptr;
      V = M->getOperand(1);
    }
  }

  // Strip off casts.
  Type *StrippedOffRecurrenceCast = nullptr;
  if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V)) {
    StrippedOffRecurrenceCast = C->getType();
    V = C->getOperand();
  }

  // Look for the loop invariant symbolic value.
  const SCEVUnknown *U = dyn_cast<SCEVUnknown>(V);
  if (!U)
    return nullptr;

  Value *Stride = U->getValue();
  if (!Lp->isLoopInvariant(Stride))
    return nullptr;

  // If we have stripped off the recurrence cast we have to make sure that we
  // return the value that is used in this loop so that we can replace it
  // later.
  if (StrippedOffRecurrenceCast)
    Stride = getUniqueCastUse(Stride, Lp, StrippedOffRecurrenceCast);

  return Stride;
}

void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) {
  Value *Ptr = nullptr;
  if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess))
    Ptr = LI->getPointerOperand();
  else if (StoreInst *SI = dyn_cast<StoreInst>(MemAccess))
    Ptr = SI->getPointerOperand();
  else
    return;

  Value *Stride = getStrideFromPointer(Ptr, SE, DL, TheLoop);
  if (!Stride)
    return;

  DEBUG(dbgs() << "LV: Found a strided access that we can version");
  DEBUG(dbgs() << "  Ptr: " << *Ptr << " Stride: " << *Stride << "\n");
  Strides[Ptr] = Stride;
  StrideSet.insert(Stride);
}
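
// For example, in a loop such as
//   for (int i = 0; i < n; ++i)
//     a[i * stride] += 1;
// the loop-invariant 'stride' is recorded in Strides/StrideSet so that the
// loop can later be versioned with a runtime 'stride == 1' guard, under which
// the access becomes consecutive and the guarded copy can be vectorized.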

void LoopVectorizationLegality::collectLoopUniforms() {
  // We now know that the loop is vectorizable!
  // Collect variables that will remain uniform after vectorization.
  std::vector<Value*> Worklist;
  BasicBlock *Latch = TheLoop->getLoopLatch();

  // Start with the conditional branch and walk up the block.
  Worklist.push_back(Latch->getTerminator()->getOperand(0));

  // Also add all consecutive pointer values; these values will be uniform
  // after vectorization (and subsequent cleanup) and, until revectorization is
  // supported, all dependencies must also be uniform.
  for (Loop::block_iterator B = TheLoop->block_begin(),
       BE = TheLoop->block_end(); B != BE; ++B)
    for (BasicBlock::iterator I = (*B)->begin(), IE = (*B)->end();
         I != IE; ++I)
      if (I->getType()->isPointerTy() && isConsecutivePtr(I))
        Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());

  while (Worklist.size()) {
    Instruction *I = dyn_cast<Instruction>(Worklist.back());
    Worklist.pop_back();

    // Look at instructions inside this loop.
    // Stop when reaching PHI nodes.
    // TODO: we need to follow values all over the loop, not only in this
    // block.
    if (!I || !TheLoop->contains(I) || isa<PHINode>(I))
      continue;

    // This is a known uniform.
    Uniforms.insert(I);

    // Insert all operands.
    Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());
  }
}
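
// For example, in 'for (i = 0; i < n; ++i) a[i] = b[i] + x;' the latch
// compare 'i < n' and the increment feeding it remain scalar after widening
// because every vector lane shares one copy; they are collected here, seeded
// from the branch condition and from the operands of the consecutive
// pointers &a[i] and &b[i].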

namespace {
/// \brief Analyses memory accesses in a loop.
///
/// Checks whether run time pointer checks are needed and builds sets for data
/// dependence checking.
class AccessAnalysis {
public:
  /// \brief Read or write access location.
  typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
  typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;

  /// \brief Set of potential dependent memory accesses.
  typedef EquivalenceClasses<MemAccessInfo> DepCandidates;

  AccessAnalysis(const DataLayout *Dl, AliasAnalysis *AA, DepCandidates &DA) :
    DL(Dl), AST(*AA), DepCands(DA), IsRTCheckNeeded(false) {}

  /// \brief Register a load and whether it is only read from.
  void addLoad(AliasAnalysis::Location &Loc, bool IsReadOnly) {
    Value *Ptr = const_cast<Value*>(Loc.Ptr);
    AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags);
    Accesses.insert(MemAccessInfo(Ptr, false));
    if (IsReadOnly)
      ReadOnlyPtr.insert(Ptr);
  }

  /// \brief Register a store.
  void addStore(AliasAnalysis::Location &Loc) {
    Value *Ptr = const_cast<Value*>(Loc.Ptr);
    AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags);
    Accesses.insert(MemAccessInfo(Ptr, true));
  }

  /// \brief Check whether we can check the pointers at runtime for
  /// non-intersection.
  bool canCheckPtrAtRT(LoopVectorizationLegality::RuntimePointerCheck &RtCheck,
                       unsigned &NumComparisons, ScalarEvolution *SE,
                       Loop *TheLoop, ValueToValueMap &Strides,
                       bool ShouldCheckStride = false);

  /// \brief Goes over all memory accesses, checks whether a RT check is needed
  /// and builds sets of dependent accesses.
  void buildDependenceSets() {
    processMemAccesses();
  }

  bool isRTCheckNeeded() { return IsRTCheckNeeded; }

  bool isDependencyCheckNeeded() { return !CheckDeps.empty(); }
  void resetDepChecks() { CheckDeps.clear(); }

  MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; }

private:
  typedef SetVector<MemAccessInfo> PtrAccessSet;

  /// \brief Go over all memory accesses, check whether runtime pointer checks
  /// are needed and build sets of dependency check candidates.
  void processMemAccesses();

  /// Set of all accesses.
  PtrAccessSet Accesses;

  /// Set of accesses that need a further dependence check.
  MemAccessInfoSet CheckDeps;

  /// Set of pointers that are read only.
  SmallPtrSet<Value*, 16> ReadOnlyPtr;

  const DataLayout *DL;

  /// An alias set tracker to partition the access set by underlying object and
  /// intrinsic property (such as TBAA metadata).
  AliasSetTracker AST;

  /// Sets of potentially dependent accesses - members of one set share an
  /// underlying pointer. The set "CheckDeps" identifies which sets really need
  /// a dependence check.
  DepCandidates &DepCands;

  bool IsRTCheckNeeded;
};

} // end anonymous namespace

/// \brief Check whether a pointer can participate in a runtime bounds check.
static bool hasComputableBounds(ScalarEvolution *SE, ValueToValueMap &Strides,
                                Value *Ptr) {
  const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, Strides, Ptr);
  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
  if (!AR)
    return false;

  return AR->isAffine();
}
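
// For example, a pointer whose SCEV is the affine AddRec {%base,+,4} has a
// known start and step, so the first and last accessed addresses can be
// computed and compared against other pointers' intervals at runtime. A
// non-affine recurrence (e.g. a step that itself varies per iteration)
// yields no such closed-form bounds.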

/// \brief Check the stride of the pointer and ensure that it does not wrap in
/// the address space.
static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr,
                        const Loop *Lp, ValueToValueMap &StridesMap);

bool AccessAnalysis::canCheckPtrAtRT(
    LoopVectorizationLegality::RuntimePointerCheck &RtCheck,
    unsigned &NumComparisons, ScalarEvolution *SE, Loop *TheLoop,
    ValueToValueMap &StridesMap, bool ShouldCheckStride) {
  // Find pointers with computable bounds. We are going to use this information
  // to place a runtime bound check.
  bool CanDoRT = true;

  bool IsDepCheckNeeded = isDependencyCheckNeeded();
  NumComparisons = 0;

  // We assign a consecutive id to accesses from different alias sets.
  // Accesses between different groups don't need to be checked.
  unsigned ASId = 1;
  for (auto &AS : AST) {
    unsigned NumReadPtrChecks = 0;
    unsigned NumWritePtrChecks = 0;

    // We assign consecutive ids to accesses from different dependence sets.
    // Accesses within the same set don't need a runtime check.
    unsigned RunningDepId = 1;
    DenseMap<Value *, unsigned> DepSetId;

    for (auto A : AS) {
      Value *Ptr = A.getValue();
      bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true));
      MemAccessInfo Access(Ptr, IsWrite);

      if (IsWrite)
        ++NumWritePtrChecks;
      else
        ++NumReadPtrChecks;

      if (hasComputableBounds(SE, StridesMap, Ptr) &&
          // When we run after a failing dependency check we have to make sure
          // we don't have wrapping pointers.
          (!ShouldCheckStride ||
           isStridedPtr(SE, DL, Ptr, TheLoop, StridesMap) == 1)) {
        // The id of the dependence set.
        unsigned DepId;

        if (IsDepCheckNeeded) {
          Value *Leader = DepCands.getLeaderValue(Access).getPointer();
          unsigned &LeaderId = DepSetId[Leader];
          if (!LeaderId)
            LeaderId = RunningDepId++;
          DepId = LeaderId;
        } else
          // Each access has its own dependence set.
          DepId = RunningDepId++;

        RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap);

        DEBUG(dbgs() << "LV: Found a runtime check ptr: " << *Ptr << '\n');
      } else {
        CanDoRT = false;
      }
    }

    if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2)
      NumComparisons += 0; // Only one dependence set.
    else {
      NumComparisons += (NumWritePtrChecks * (NumReadPtrChecks +
                                              NumWritePtrChecks - 1));
    }
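
    // For example, an alias set with 2 write pointers and 3 read pointers in
    // more than one dependence set contributes 2 * (3 + 2 - 1) = 8
    // comparisons: for each write pointer, one check against every read and
    // against every other write.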

    ++ASId;
  }

  // If the pointers that we would use for the bounds comparison have different
  // address spaces, assume the values aren't directly comparable, so we can't
  // use them for the runtime check. We also have to assume they could
  // overlap. In the future there should be metadata for whether address spaces
  // are disjoint.
  unsigned NumPointers = RtCheck.Pointers.size();
  for (unsigned i = 0; i < NumPointers; ++i) {
    for (unsigned j = i + 1; j < NumPointers; ++j) {
      // Only need to check pointers between two different dependency sets.
      if (RtCheck.DependencySetId[i] == RtCheck.DependencySetId[j])
        continue;
      // Only need to check pointers in the same alias set.
      if (RtCheck.AliasSetId[i] != RtCheck.AliasSetId[j])
        continue;

      Value *PtrI = RtCheck.Pointers[i];
      Value *PtrJ = RtCheck.Pointers[j];

      unsigned ASi = PtrI->getType()->getPointerAddressSpace();
      unsigned ASj = PtrJ->getType()->getPointerAddressSpace();
      if (ASi != ASj) {
        DEBUG(dbgs() << "LV: Runtime check would require comparison between"
                        " different address spaces\n");
        return false;
      }
    }
  }

  return CanDoRT;
}

void AccessAnalysis::processMemAccesses() {
  // We process the set twice: first we process read-write pointers, last we
  // process read-only pointers. This allows us to skip dependence tests for
  // read-only pointers.

  DEBUG(dbgs() << "LV: Processing memory accesses...\n");
  DEBUG(dbgs() << "  AST: "; AST.dump());
  DEBUG(dbgs() << "LV:   Accesses:\n");
  DEBUG({
    for (auto A : Accesses)
      dbgs() << "\t" << *A.getPointer() << " (" <<
                (A.getInt() ? "write" : (ReadOnlyPtr.count(A.getPointer()) ?
                                         "read-only" : "read")) << ")\n";
  });

  // The AliasSetTracker has nicely partitioned our pointers by metadata
  // compatibility and potential for underlying-object overlap. As a result, we
  // only need to check for potential pointer dependencies within each alias
  // set.
  for (auto &AS : AST) {
    // Note that both the alias-set tracker and the alias sets themselves use
    // linked lists internally and so the iteration order here is deterministic
    // (matching the original instruction order within each set).
    bool SetHasWrite = false;

    // Map of pointers to last access encountered.
    typedef DenseMap<Value*, MemAccessInfo> UnderlyingObjToAccessMap;
    UnderlyingObjToAccessMap ObjToLastAccess;

    // Set of accesses to check after all writes have been processed.
    PtrAccessSet DeferredAccesses;

    // Iterate over each alias set twice, once to process read/write pointers,
    // and then to process read-only pointers.
    for (int SetIteration = 0; SetIteration < 2; ++SetIteration) {
      bool UseDeferred = SetIteration > 0;
      PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses;

      for (auto AV : AS) {
        Value *Ptr = AV.getValue();

        // For a single memory access in AliasSetTracker, Accesses may contain
        // both read and write, and they both need to be handled for CheckDeps.
        for (auto AC : S) {
          if (AC.getPointer() != Ptr)
            continue;

          bool IsWrite = AC.getInt();

          // If we're using the deferred access set, then it contains only
          // reads.
          bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite;
          if (UseDeferred && !IsReadOnlyPtr)
            continue;
          // Otherwise, the pointer must be in the PtrAccessSet, either as a
          // read or a write.
          assert(((IsReadOnlyPtr && UseDeferred) || IsWrite ||
                  S.count(MemAccessInfo(Ptr, false))) &&
                 "Alias-set pointer not in the access set?");

          MemAccessInfo Access(Ptr, IsWrite);
          DepCands.insert(Access);

          // Memorize read-only pointers for later processing and skip them in
          // the first round (they need to be checked after we have seen all
          // write pointers). Note: we also mark pointers that are not
          // consecutive as "read-only" pointers (so that we check
          // "a[b[i]] +="). Hence, we need the second check for "!IsWrite".
          if (!UseDeferred && IsReadOnlyPtr) {
            DeferredAccesses.insert(Access);
            continue;
          }

          // If this is a write - check other reads and writes for conflicts.
          // If this is a read only check other writes for conflicts (but only
          // if there is no other write to the ptr - this is an optimization to
          // catch "a[i] = a[i] + " without having to do a dependence check).
          if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) {
            CheckDeps.insert(Access);
            IsRTCheckNeeded = true;
          }

          if (IsWrite)
            SetHasWrite = true;

          // Create sets of pointers connected by a shared alias set and
          // underlying object.
          typedef SmallVector<Value *, 16> ValueVector;
          ValueVector TempObjects;
          GetUnderlyingObjects(Ptr, TempObjects, DL);
          for (Value *UnderlyingObj : TempObjects) {
            UnderlyingObjToAccessMap::iterator Prev =
                ObjToLastAccess.find(UnderlyingObj);
            if (Prev != ObjToLastAccess.end())
              DepCands.unionSets(Access, Prev->second);

            ObjToLastAccess[UnderlyingObj] = Access;
          }
        }
      }
    }
  }
}

namespace {
/// \brief Checks memory dependences among accesses to the same underlying
/// object to determine whether vectorization is legal or not (and at
/// which vectorization factor).
///
/// This class works under the assumption that we already checked that memory
/// locations with different underlying pointers are "must-not alias".
/// We use the ScalarEvolution framework to symbolically evaluate pairs of
/// access functions. Since we currently don't restructure the loop we can rely
/// on the program order of memory accesses to determine their safety.
/// At the moment we will only deem accesses as safe for:
///  * A negative constant distance assuming program order.
///
///      Safe: tmp = a[i + 1];            OR     a[i + 1] = x;
///            a[i] = tmp;                       y = a[i];
///
///   The latter case is safe because later checks guarantee that there can't
///   be a cycle through a phi node (that is, we check that "x" and "y" are
///   not the same variable: a header phi can only be an induction or a
///   reduction, a reduction can't have a memory sink, an induction can't have
///   a memory source). This is important and must not be violated (or we have
///   to resort to checking for cycles through memory).
///
///  * A positive constant distance assuming program order that is bigger
///    than the biggest memory access.
///
///     tmp = a[i]        OR              b[i] = x
///     a[i+2] = tmp                      y = b[i+2];
///
///     Safe distance: 2 x sizeof(a[0]), and 2 x sizeof(b[0]), respectively.
///
///  * Zero distances and all accesses have the same size.
///
class MemoryDepChecker {
public:
  typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
  typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;

  MemoryDepChecker(ScalarEvolution *Se, const DataLayout *Dl, const Loop *L)
      : SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0),
        ShouldRetryWithRuntimeCheck(false) {}

  /// \brief Register the location (instructions are given increasing numbers)
  /// of a write access.
  void addAccess(StoreInst *SI) {
    Value *Ptr = SI->getPointerOperand();
    Accesses[MemAccessInfo(Ptr, true)].push_back(AccessIdx);
    InstMap.push_back(SI);
    ++AccessIdx;
  }

  /// \brief Register the location (instructions are given increasing numbers)
  /// of a read access.
  void addAccess(LoadInst *LI) {
    Value *Ptr = LI->getPointerOperand();
    Accesses[MemAccessInfo(Ptr, false)].push_back(AccessIdx);
    InstMap.push_back(LI);
    ++AccessIdx;
  }

  /// \brief Check whether the dependencies between the accesses are safe.
  ///
  /// Only checks sets with elements in \p CheckDeps.
  bool areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
                   MemAccessInfoSet &CheckDeps, ValueToValueMap &Strides);

  /// \brief The maximum number of bytes of a vector register we can vectorize
  /// the accesses safely with.
  unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }

  /// \brief In some cases when the dependency check fails we can still
  /// vectorize the loop with a dynamic array access check.
  bool shouldRetryWithRuntimeCheck() { return ShouldRetryWithRuntimeCheck; }

private:
  ScalarEvolution *SE;
  const DataLayout *DL;
  const Loop *InnermostLoop;

  /// \brief Maps access locations (ptr, read/write) to program order.
  DenseMap<MemAccessInfo, std::vector<unsigned> > Accesses;

  /// \brief Memory access instructions in program order.
  SmallVector<Instruction *, 16> InstMap;

  /// \brief The program order index to be used for the next instruction.
  unsigned AccessIdx;

  // We can access this many bytes in parallel safely.
  unsigned MaxSafeDepDistBytes;

  /// \brief If we see a non-constant dependence distance we can still try to
  /// vectorize this loop with runtime checks.
  bool ShouldRetryWithRuntimeCheck;

  /// \brief Check whether there is a plausible dependence between the two
  /// accesses.
  ///
  /// Access \p A must happen before \p B in program order. The two indices
  /// identify the index into the program order map.
  ///
  /// This function checks whether there is a plausible dependence (or the
  /// absence of such can't be proved) between the two accesses. If there is a
  /// plausible dependence but the dependence distance is bigger than one
  /// element access it records this distance in \p MaxSafeDepDistBytes (if
  /// this distance is smaller than any other distance encountered so far).
  /// Otherwise, this function returns true signaling a possible dependence.
  bool isDependent(const MemAccessInfo &A, unsigned AIdx,
                   const MemAccessInfo &B, unsigned BIdx,
                   ValueToValueMap &Strides);

  /// \brief Check whether the data dependence could prevent store-load
  /// forwarding.
  bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize);
};

} // end anonymous namespace

static bool isInBoundsGep(Value *Ptr) {
  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr))
    return GEP->isInBounds();
  return false;
}

/// \brief Check whether the access through \p Ptr has a constant stride.
static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr,
                        const Loop *Lp, ValueToValueMap &StridesMap) {
  const Type *Ty = Ptr->getType();
  assert(Ty->isPointerTy() && "Unexpected non-ptr");

  // Make sure that the pointer does not point to aggregate types.
  const PointerType *PtrTy = cast<PointerType>(Ty);
  if (PtrTy->getElementType()->isAggregateType()) {
    DEBUG(dbgs() << "LV: Bad stride - Not a pointer to a scalar type" << *Ptr <<
          "\n");
    return 0;
  }

  const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, StridesMap, Ptr);

  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
  if (!AR) {
    DEBUG(dbgs() << "LV: Bad stride - Not an AddRecExpr pointer "
          << *Ptr << " SCEV: " << *PtrScev << "\n");
    return 0;
  }

  // The access function must stride over the innermost loop.
  if (Lp != AR->getLoop()) {
    DEBUG(dbgs() << "LV: Bad stride - Not striding over innermost loop " <<
          *Ptr << " SCEV: " << *PtrScev << "\n");
    return 0;
  }

  // The address calculation must not wrap. Otherwise, a dependence could be
  // inverted.
  // An inbounds getelementptr that is an AddRec with a unit stride
  // cannot wrap per definition. The unit stride requirement is checked later.
  // A getelementptr without an inbounds attribute and unit stride would have
  // to access the pointer value "0" which is undefined behavior in address
  // space 0, therefore we can also vectorize this case.
  bool IsInBoundsGEP = isInBoundsGep(Ptr);
  bool IsNoWrapAddRec = AR->getNoWrapFlags(SCEV::NoWrapMask);
  bool IsInAddressSpaceZero = PtrTy->getAddressSpace() == 0;
  if (!IsNoWrapAddRec && !IsInBoundsGEP && !IsInAddressSpaceZero) {
    DEBUG(dbgs() << "LV: Bad stride - Pointer may wrap in the address space "
          << *Ptr << " SCEV: " << *PtrScev << "\n");
    return 0;
  }

  // Check the step is constant.
  const SCEV *Step = AR->getStepRecurrence(*SE);

  // Calculate the pointer stride and check if it is consecutive.
  const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
  if (!C) {
    DEBUG(dbgs() << "LV: Bad stride - Not a constant strided " << *Ptr <<
          " SCEV: " << *PtrScev << "\n");
    return 0;
  }

  int64_t Size = DL->getTypeAllocSize(PtrTy->getElementType());
  const APInt &APStepVal = C->getValue()->getValue();

  // Huge step value - give up.
  if (APStepVal.getBitWidth() > 64)
    return 0;

  int64_t StepVal = APStepVal.getSExtValue();

  // Strided access.
  int64_t Stride = StepVal / Size;
  int64_t Rem = StepVal % Size;
  if (Rem)
    return 0;
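
  // For example, an i32 access (Size == 4) whose address advances by 8 bytes
  // per iteration (StepVal == 8) yields Stride == 2 with Rem == 0, i.e. every
  // other element is touched; StepVal == 4 would be the consecutive case with
  // Stride == 1.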

  // If the SCEV could wrap but we have an inbounds gep with a unit stride we
  // know we can't "wrap around the address space". In case of address space
  // zero we know that this won't happen without triggering undefined behavior.
  if (!IsNoWrapAddRec && (IsInBoundsGEP || IsInAddressSpaceZero) &&
      Stride != 1 && Stride != -1)
    return 0;

  return Stride;
}

bool MemoryDepChecker::couldPreventStoreLoadForward(unsigned Distance,
                                                    unsigned TypeByteSize) {
  // If loads occur at a distance that is not a multiple of a feasible vector
  // factor store-load forwarding does not take place.
  // Positive dependences might cause troubles because vectorizing them might
  // prevent store-load forwarding making vectorized code run a lot slower.
  //   a[i] = a[i-3] ^ a[i-8];
  //   The stores to a[i:i+1] don't align with the loads from a[i-3:i-2] and
  //   hence on your typical architecture store-load forwarding does not take
  //   place. Vectorizing in such cases does not make sense.
  // Store-load forwarding distance.
  const unsigned NumCyclesForStoreLoadThroughMemory = 8*TypeByteSize;
  // Maximum vector factor.
  unsigned MaxVFWithoutSLForwardIssues = MaxVectorWidth*TypeByteSize;
  if (MaxSafeDepDistBytes < MaxVFWithoutSLForwardIssues)
    MaxVFWithoutSLForwardIssues = MaxSafeDepDistBytes;

  for (unsigned vf = 2*TypeByteSize; vf <= MaxVFWithoutSLForwardIssues;
       vf *= 2) {
    if (Distance % vf && Distance / vf < NumCyclesForStoreLoadThroughMemory) {
      MaxVFWithoutSLForwardIssues = (vf >>= 1);
      break;
    }
  }
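
  // For example, with i32 accesses (TypeByteSize == 4) and Distance == 12
  // (the 'a[i-3]' case above), the first candidate vf is 8: 12 % 8 != 0 and
  // 12 / 8 == 1 is well below the forwarding threshold of 32, so
  // MaxVFWithoutSLForwardIssues drops to 4 bytes - too small for a vector of
  // two i32s - and the function reports a forwarding conflict below.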

  if (MaxVFWithoutSLForwardIssues < 2*TypeByteSize) {
    DEBUG(dbgs() << "LV: Distance " << Distance <<
          " that could cause a store-load forwarding conflict\n");
    return true;
  }

  if (MaxVFWithoutSLForwardIssues < MaxSafeDepDistBytes &&
      MaxVFWithoutSLForwardIssues != MaxVectorWidth*TypeByteSize)
    MaxSafeDepDistBytes = MaxVFWithoutSLForwardIssues;
  return false;
}

bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
                                   const MemAccessInfo &B, unsigned BIdx,
                                   ValueToValueMap &Strides) {
  assert(AIdx < BIdx && "Must pass arguments in program order");

  Value *APtr = A.getPointer();
  Value *BPtr = B.getPointer();
  bool AIsWrite = A.getInt();
  bool BIsWrite = B.getInt();

  // Two reads are independent.
  if (!AIsWrite && !BIsWrite)
    return false;

  // We cannot check pointers in different address spaces.
  if (APtr->getType()->getPointerAddressSpace() !=
      BPtr->getType()->getPointerAddressSpace())
    return true;

  const SCEV *AScev = replaceSymbolicStrideSCEV(SE, Strides, APtr);
  const SCEV *BScev = replaceSymbolicStrideSCEV(SE, Strides, BPtr);

  int StrideAPtr = isStridedPtr(SE, DL, APtr, InnermostLoop, Strides);
  int StrideBPtr = isStridedPtr(SE, DL, BPtr, InnermostLoop, Strides);

  const SCEV *Src = AScev;
  const SCEV *Sink = BScev;

  // If the induction step is negative we have to invert source and sink of the
  // dependence.
  if (StrideAPtr < 0) {
    // Src = BScev;
    // Sink = AScev;
    std::swap(APtr, BPtr);
    std::swap(Src, Sink);
    std::swap(AIsWrite, BIsWrite);
    std::swap(AIdx, BIdx);
    std::swap(StrideAPtr, StrideBPtr);
  }

  const SCEV *Dist = SE->getMinusSCEV(Sink, Src);

  DEBUG(dbgs() << "LV: Src Scev: " << *Src << " Sink Scev: " << *Sink
        << " (Induction step: " << StrideAPtr << ")\n");
  DEBUG(dbgs() << "LV: Distance for " << *InstMap[AIdx] << " to "
        << *InstMap[BIdx] << ": " << *Dist << "\n");

  // Need consecutive accesses. We don't want to vectorize
  // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap in
  // the address space.
  if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr) {
    DEBUG(dbgs() << "Non-consecutive pointer access\n");
    return true;
  }

  const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
  if (!C) {
    DEBUG(dbgs() << "LV: Dependence because of non-constant distance\n");
    ShouldRetryWithRuntimeCheck = true;
    return true;
  }

  Type *ATy = APtr->getType()->getPointerElementType();
  Type *BTy = BPtr->getType()->getPointerElementType();
  unsigned TypeByteSize = DL->getTypeAllocSize(ATy);

  // Negative distances are not plausible dependencies.
  const APInt &Val = C->getValue()->getValue();
  if (Val.isNegative()) {
    bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
    if (IsTrueDataDependence &&
        (couldPreventStoreLoadForward(Val.abs().getZExtValue(), TypeByteSize) ||
         ATy != BTy))
      return true;

    DEBUG(dbgs() << "LV: Dependence is negative: NoDep\n");
    return false;
  }

  // Write to the same location with the same size.
  // Could be improved to assert type sizes are the same (i32 == float, etc).
  if (Val == 0) {
    if (ATy == BTy)
      return false;
    DEBUG(dbgs() << "LV: Zero dependence difference but different types\n");
    return true;
  }

  assert(Val.isStrictlyPositive() && "Expect a positive value");

  // Positive distance bigger than max vectorization factor.
  if (ATy != BTy) {
    DEBUG(dbgs() <<
          "LV: ReadWrite-Write positive dependency with different types\n");
    return false;
  }

  unsigned Distance = (unsigned) Val.getZExtValue();

  // Bail out early if passed-in parameters make vectorization not feasible.
  unsigned ForcedFactor = VectorizationFactor ? VectorizationFactor : 1;
  unsigned ForcedUnroll = VectorizationInterleave ? VectorizationInterleave : 1;

  // The distance must be bigger than the size needed for a vectorized version
  // of the operation and the size of the vectorized operation must not be
  // bigger than the current maximum size.
  if (Distance < 2*TypeByteSize ||
      2*TypeByteSize > MaxSafeDepDistBytes ||
      Distance < TypeByteSize * ForcedUnroll * ForcedFactor) {
    DEBUG(dbgs() << "LV: Failure because of Positive distance "
          << Val.getSExtValue() << '\n');
    return true;
  }
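
  // For example, with i32 accesses (TypeByteSize == 4) and no forced factor
  // or interleave, a constant distance of 8 bytes passes all three checks:
  // the accesses can be vectorized with at most two i32 lanes, and
  // MaxSafeDepDistBytes is capped to 8 below so later queries never assume a
  // wider safe vector.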

  MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ?
    Distance : MaxSafeDepDistBytes;

  bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
  if (IsTrueDataDependence &&
      couldPreventStoreLoadForward(Distance, TypeByteSize))
    return true;

  DEBUG(dbgs() << "LV: Positive distance " << Val.getSExtValue() <<
        " with max VF = " << MaxSafeDepDistBytes / TypeByteSize << '\n');

  return false;
}

bool MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
                                   MemAccessInfoSet &CheckDeps,
                                   ValueToValueMap &Strides) {
  MaxSafeDepDistBytes = -1U;
  while (!CheckDeps.empty()) {
    MemAccessInfo CurAccess = *CheckDeps.begin();

    // Get the relevant memory access set.
    EquivalenceClasses<MemAccessInfo>::iterator I =
      AccessSets.findValue(AccessSets.getLeaderValue(CurAccess));

    // Check accesses within this set.
    EquivalenceClasses<MemAccessInfo>::member_iterator AI, AE;
    AI = AccessSets.member_begin(I), AE = AccessSets.member_end();

    // Check every access pair.
    while (AI != AE) {
      CheckDeps.erase(*AI);
      EquivalenceClasses<MemAccessInfo>::member_iterator OI = std::next(AI);
      while (OI != AE) {
        // Check every accessing instruction pair in program order.
        for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(),
             I1E = Accesses[*AI].end(); I1 != I1E; ++I1)
          for (std::vector<unsigned>::iterator I2 = Accesses[*OI].begin(),
               I2E = Accesses[*OI].end(); I2 != I2E; ++I2) {
            if (*I1 < *I2 && isDependent(*AI, *I1, *OI, *I2, Strides))
              return false;
            if (*I2 < *I1 && isDependent(*OI, *I2, *AI, *I1, Strides))
              return false;
          }
        ++OI;
      }
      ++AI;
    }
  }
  return true;
}

bool LoopVectorizationLegality::canVectorizeMemory() {
  typedef SmallVector<Value*, 16> ValueVector;
  typedef SmallPtrSet<Value*, 16> ValueSet;

  // Holds the Load and Store *instructions*.
  ValueVector Loads;
  ValueVector Stores;

  // Holds all the different accesses in the loop.
  unsigned NumReads = 0;
  unsigned NumReadWrites = 0;

  PtrRtCheck.Pointers.clear();
  PtrRtCheck.Need = false;

  const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
  MemoryDepChecker DepChecker(SE, DL, TheLoop);

  // For each block.
  for (Loop::block_iterator bb = TheLoop->block_begin(),
       be = TheLoop->block_end(); bb != be; ++bb) {

    // Scan the BB and collect legal loads and stores.
    for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
         ++it) {

      // If this is a load, save it. If this instruction can read from memory
      // but is not a load, then we quit. Notice that we don't handle function
      // calls that read or write.
      if (it->mayReadFromMemory()) {
        // Many math library functions read the rounding mode. We will only
        // vectorize a loop if it contains known function calls that don't set
        // the flag. Therefore, it is safe to ignore this read from memory.
        CallInst *Call = dyn_cast<CallInst>(it);
        if (Call && getIntrinsicIDForCall(Call, TLI))
          continue;

        LoadInst *Ld = dyn_cast<LoadInst>(it);
        if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) {
          emitAnalysis(Report(Ld)
                       << "read with atomic ordering or volatile read");
          DEBUG(dbgs() << "LV: Found a non-simple load.\n");
          return false;
        }
        NumLoads++;
        Loads.push_back(Ld);
        DepChecker.addAccess(Ld);
        continue;
      }

      // Save 'store' instructions. Abort if other instructions write to
      // memory.
      if (it->mayWriteToMemory()) {
        StoreInst *St = dyn_cast<StoreInst>(it);
        if (!St) {
          emitAnalysis(Report(it) << "instruction cannot be vectorized");
          return false;
        }
        if (!St->isSimple() && !IsAnnotatedParallel) {
          emitAnalysis(Report(St)
                       << "write with atomic ordering or volatile write");
          DEBUG(dbgs() << "LV: Found a non-simple store.\n");
          return false;
        }
        NumStores++;
        Stores.push_back(St);
        DepChecker.addAccess(St);
      }
    } // Next instr.
  } // Next block.

  // Now we have two lists that hold the loads and the stores.
  // Next, we find the pointers that they use.

  // Check if we see any stores. If there are no stores, then we don't
  // care if the pointers are *restrict*.
  if (!Stores.size()) {
    DEBUG(dbgs() << "LV: Found a read-only loop!\n");
    return true;
  }

  AccessAnalysis::DepCandidates DependentAccesses;
  AccessAnalysis Accesses(DL, AA, DependentAccesses);

  // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
  // multiple times on the same object. If the ptr is accessed twice, once
  // for read and once for write, it will only appear once (on the write
  // list). This is okay, since we are going to check for conflicts between
  // writes and between reads and writes, but not between reads and reads.
  ValueSet Seen;

  ValueVector::iterator I, IE;
  for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
    StoreInst *ST = cast<StoreInst>(*I);
    Value *Ptr = ST->getPointerOperand();

    if (isUniform(Ptr)) {
      emitAnalysis(
          Report(ST)
          << "write to a loop invariant address could not be vectorized");
      DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
      return false;
    }

    // If we did *not* see this pointer before, insert it to the read-write
    // list. At this phase it is only a 'write' list.
    if (Seen.insert(Ptr).second) {
      ++NumReadWrites;

      AliasAnalysis::Location Loc = AA->getLocation(ST);
      // The TBAA metadata could have a control dependency on the predication
      // condition, so we cannot rely on it when determining whether or not we
      // need runtime pointer checks.
      if (blockNeedsPredication(ST->getParent()))
        Loc.AATags.TBAA = nullptr;

      Accesses.addStore(Loc);
    }
  }

  if (IsAnnotatedParallel) {
    DEBUG(dbgs()
          << "LV: A loop annotated parallel, ignore memory dependency "
          << "checks.\n");
    return true;
  }

  for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
    LoadInst *LD = cast<LoadInst>(*I);
    Value *Ptr = LD->getPointerOperand();
    // If we did *not* see this pointer before, insert it to the
    // read list. If we *did* see it before, then it is already in
    // the read-write list. This allows us to vectorize expressions
    // such as A[i] += x; because the address of A[i] is a read-write
    // pointer. This only works if the index of A[i] is consecutive.
    // If the address of i is unknown (for example A[B[i]]) then we may
    // read a few words, modify, and write a few words, and some of the
    // words may be written to the same address.
    bool IsReadOnlyPtr = false;
    if (Seen.insert(Ptr).second ||
        !isStridedPtr(SE, DL, Ptr, TheLoop, Strides)) {
      ++NumReads;
      IsReadOnlyPtr = true;
    }

    AliasAnalysis::Location Loc = AA->getLocation(LD);
    // The TBAA metadata could have a control dependency on the predication
    // condition, so we cannot rely on it when determining whether or not we
    // need runtime pointer checks.
    if (blockNeedsPredication(LD->getParent()))
      Loc.AATags.TBAA = nullptr;

    Accesses.addLoad(Loc, IsReadOnlyPtr);
  }

  // If we write (or read-write) to a single destination and there are no
  // other reads in this loop then it is safe to vectorize.
  if (NumReadWrites == 1 && NumReads == 0) {
    DEBUG(dbgs() << "LV: Found a write-only loop!\n");
    return true;
  }

  // Build dependence sets and check whether we need a runtime pointer bounds
  // check.
  Accesses.buildDependenceSets();
  bool NeedRTCheck = Accesses.isRTCheckNeeded();

  // Find pointers with computable bounds. We are going to use this information
  // to place a runtime bound check.
  unsigned NumComparisons = 0;
  bool CanDoRT = false;
  if (NeedRTCheck)
    CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, TheLoop,
                                       Strides);

  DEBUG(dbgs() << "LV: We need to do " << NumComparisons <<
        " pointer comparisons.\n");

  // If we only have one set of dependences to check pointers among we don't
  // need a runtime check.
  if (NumComparisons == 0 && NeedRTCheck)
    NeedRTCheck = false;

  // Check that we did not collect too many pointers or found an unsizeable
  // pointer.
  if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
    PtrRtCheck.reset();
    CanDoRT = false;
  }

  if (CanDoRT) {
    DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n");
  }

  if (NeedRTCheck && !CanDoRT) {
    emitAnalysis(Report() << "cannot identify array bounds");
    DEBUG(dbgs() << "LV: We can't vectorize because we can't find " <<
          "the array bounds.\n");
    PtrRtCheck.reset();
    return false;
  }

  PtrRtCheck.Need = NeedRTCheck;

  bool CanVecMem = true;
  if (Accesses.isDependencyCheckNeeded()) {
    DEBUG(dbgs() << "LV: Checking memory dependencies\n");
    CanVecMem = DepChecker.areDepsSafe(
        DependentAccesses, Accesses.getDependenciesToCheck(), Strides);
    MaxSafeDepDistBytes = DepChecker.getMaxSafeDepDistBytes();

    if (!CanVecMem && DepChecker.shouldRetryWithRuntimeCheck()) {
      DEBUG(dbgs() << "LV: Retrying with memory checks\n");
      NeedRTCheck = true;

      // Clear the dependency checks. We assume they are not needed.
      Accesses.resetDepChecks();

      PtrRtCheck.reset();
      PtrRtCheck.Need = true;

      CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE,
                                         TheLoop, Strides, true);
      // Check that we did not collect too many pointers or found an
      // unsizeable pointer.
      if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
        if (!CanDoRT && NumComparisons > 0)
          emitAnalysis(Report()
                       << "cannot check memory dependencies at runtime");
        else
          emitAnalysis(Report()
                       << NumComparisons << " exceeds limit of "
                       << RuntimeMemoryCheckThreshold
                       << " dependent memory operations checked at runtime");
        DEBUG(dbgs() << "LV: Can't vectorize with memory checks\n");
        PtrRtCheck.reset();
        return false;
      }

      CanVecMem = true;
    }
  }

  if (!CanVecMem)
    emitAnalysis(Report() << "unsafe dependent memory operations in loop");

  DEBUG(dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't") <<
        " need a runtime memory check.\n");

  return CanVecMem;
}

static bool hasMultipleUsesOf(Instruction *I,
                              SmallPtrSetImpl<Instruction *> &Insts) {
  unsigned NumUses = 0;
  for (User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E;
       ++Use) {
    if (Insts.count(dyn_cast<Instruction>(*Use)))
      ++NumUses;
    if (NumUses > 1)
      return true;
  }

  return false;
}

static bool areAllUsesIn(Instruction *I, SmallPtrSetImpl<Instruction *> &Set) {
  for (User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use)
    if (!Set.count(dyn_cast<Instruction>(*Use)))
      return false;
  return true;
}

bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
                                                ReductionKind Kind) {
  if (Phi->getNumIncomingValues() != 2)
    return false;

  // Reduction variables are only found in the loop header block.
  if (Phi->getParent() != TheLoop->getHeader())
    return false;

  // Obtain the reduction start value from the value that comes from the loop
  // preheader.
  Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader());

  // ExitInstruction is the single value which is used outside the loop.
  // We only allow for a single reduction value to be used outside the loop.
  // This includes users of the reduction and the variables that form the
  // cycle which ends in the phi node.
  Instruction *ExitInstruction = nullptr;
  // Indicates that we found a reduction operation in our scan.
  bool FoundReduxOp = false;

  // We start with the PHI node and scan for all of the users of this
  // instruction. All users must be instructions that can be used as reduction
  // variables (such as ADD). We must have a single out-of-block user. The
  // cycle must include the original PHI.
  bool FoundStartPHI = false;

  // To recognize min/max patterns formed by an icmp select sequence, we store
  // the number of instructions we saw from the recognized min/max pattern,
  // to make sure we only see exactly the two instructions.
  unsigned NumCmpSelectPatternInst = 0;
  ReductionInstDesc ReduxDesc(false, nullptr);

  SmallPtrSet<Instruction *, 8> VisitedInsts;
  SmallVector<Instruction *, 8> Worklist;
  Worklist.push_back(Phi);
  VisitedInsts.insert(Phi);

  // A value in the reduction can be used:
  //  - By the reduction:
  //      - Reduction operation:
  //        - One use of the reduction value (safe).
  //        - Multiple uses of the reduction value (not safe).
  //      - PHI:
  //        - All uses of the PHI must be the reduction (safe).
  //        - Otherwise, not safe.
  //  - By one instruction outside of the loop (safe).
  //  - By further instructions outside of the loop (not safe).
  //  - By an instruction that is not part of the reduction (not safe).
  //    This is either:
  //      * An instruction type other than PHI or the reduction operation.
  //      * A PHI in the header other than the initial PHI.
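  //
  // For illustration, a safe chain for an integer add reduction looks like:
  //   loop:
  //     %sum = phi i32 [ %start, %preheader ], [ %sum.next, %loop ]
  //     %sum.next = add i32 %sum, %val
  //     ...
  //   exit:
  //     ... = use of %sum.next
  // The add is the single in-loop user of the header PHI, it feeds back into
  // the PHI to close the cycle, and its only out-of-loop user sits in the
  // exit block, so it becomes the ExitInstruction recorded below.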
while ( ! Worklist . empty ( ) ) {
Instruction * Cur = Worklist . back ( ) ;
Worklist . pop_back ( ) ;
// No Users.
// If the instruction has no users then this is a broken chain and can't be
// a reduction variable.
if ( Cur - > use_empty ( ) )
2012-12-02 08:10:19 -05:00
return false ;
2013-12-21 19:04:03 -05:00
bool IsAPhi = isa < PHINode > ( Cur ) ;
2012-12-02 08:10:19 -05:00
2013-12-21 19:04:03 -05:00
// A header PHI use other than the original PHI.
if ( Cur ! = Phi & & IsAPhi & & Cur - > getParent ( ) = = Phi - > getParent ( ) )
return false ;
2013-04-08 14:41:23 -04:00
2013-12-21 19:04:03 -05:00
// Reductions of instructions such as Div, and Sub is only possible if the
// LHS is the reduction variable.
if ( ! Cur - > isCommutative ( ) & & ! IsAPhi & & ! isa < SelectInst > ( Cur ) & &
! isa < ICmpInst > ( Cur ) & & ! isa < FCmpInst > ( Cur ) & &
! VisitedInsts . count ( dyn_cast < Instruction > ( Cur - > getOperand ( 0 ) ) ) )
return false ;
// Any reduction instruction must be of one of the allowed kinds.
ReduxDesc = isReductionInstr ( Cur , Kind , ReduxDesc ) ;
if ( ! ReduxDesc . IsReduction )
return false ;
// A reduction operation must only have one use of the reduction value.
if ( ! IsAPhi & & Kind ! = RK_IntegerMinMax & & Kind ! = RK_FloatMinMax & &
hasMultipleUsesOf ( Cur , VisitedInsts ) )
return false ;
// All inputs to a PHI node must be a reduction value.
if ( IsAPhi & & Cur ! = Phi & & ! areAllUsesIn ( Cur , VisitedInsts ) )
return false ;
if ( Kind = = RK_IntegerMinMax & & ( isa < ICmpInst > ( Cur ) | |
isa < SelectInst > ( Cur ) ) )
+ + NumCmpSelectPatternInst ;
if ( Kind = = RK_FloatMinMax & & ( isa < FCmpInst > ( Cur ) | |
isa < SelectInst > ( Cur ) ) )
+ + NumCmpSelectPatternInst ;
// Check whether we found a reduction operator.
FoundReduxOp | = ! IsAPhi ;
2014-11-24 04:08:18 -05:00
// Process users of current instruction. Push non-PHI nodes after PHI nodes
2013-12-21 19:04:03 -05:00
// onto the stack. This way we are going to have seen all inputs to PHI
// nodes once we get to them.
SmallVector < Instruction * , 8 > NonPHIs ;
SmallVector < Instruction * , 8 > PHIs ;
2014-11-24 04:08:18 -05:00
for ( User * U : Cur - > users ( ) ) {
Instruction * UI = cast < Instruction > ( U ) ;
2013-04-08 14:41:23 -04:00
2012-12-02 08:10:19 -05:00
// Check if we found the exit user.
2014-11-24 04:08:18 -05:00
BasicBlock * Parent = UI - > getParent ( ) ;
2013-04-08 14:41:23 -04:00
if ( ! TheLoop - > contains ( Parent ) ) {
2013-12-21 19:04:03 -05:00
// Exit if you find multiple outside users or if the header phi node is
// being used. In this case the user uses the value of the previous
// iteration, in which case we would loose "VF-1" iterations of the
// reduction operation if we vectorize.
2014-11-24 04:08:18 -05:00
if ( ExitInstruction ! = nullptr | | Cur = = Phi )
2012-12-02 08:10:19 -05:00
return false ;
2013-04-08 14:41:23 -04:00
2013-12-21 19:04:03 -05:00
// The instruction used by an outside user must be the last instruction
// before we feed back to the reduction phi. Otherwise, we loose VF-1
// operations on the value.
if ( std : : find ( Phi - > op_begin ( ) , Phi - > op_end ( ) , Cur ) = = Phi - > op_end ( ) )
return false ;
2013-04-08 14:41:23 -04:00
2013-12-21 19:04:03 -05:00
ExitInstruction = Cur ;
continue ;
}
2013-04-08 14:41:23 -04:00
2014-05-11 14:24:26 -04:00
// Process instructions only once (termination). Each reduction cycle
// value must only be used once, except by phi nodes and min/max
// reductions which are represented as a cmp followed by a select.
2014-11-24 04:08:18 -05:00
ReductionInstDesc IgnoredVal ( false , nullptr ) ;
2015-01-18 11:17:27 -05:00
if ( VisitedInsts . insert ( UI ) . second ) {
2014-11-24 04:08:18 -05:00
if ( isa < PHINode > ( UI ) )
PHIs . push_back ( UI ) ;
2013-12-21 19:04:03 -05:00
else
2014-11-24 04:08:18 -05:00
NonPHIs . push_back ( UI ) ;
} else if ( ! isa < PHINode > ( UI ) & &
( ( ! isa < FCmpInst > ( UI ) & &
! isa < ICmpInst > ( UI ) & &
! isa < SelectInst > ( UI ) ) | |
! isMinMaxSelectCmpPattern ( UI , IgnoredVal ) . IsReduction ) )
2014-05-11 14:24:26 -04:00
return false ;
2013-12-21 19:04:03 -05:00
// Remember that we completed the cycle.
2014-11-24 04:08:18 -05:00
if ( UI = = Phi )
2013-12-21 19:04:03 -05:00
FoundStartPHI = true ;
}
Worklist . append ( PHIs . begin ( ) , PHIs . end ( ) ) ;
Worklist . append ( NonPHIs . begin ( ) , NonPHIs . end ( ) ) ;
}
  // This means we have seen one but not the other instruction of the
  // pattern or more than just a select and cmp.
  if ((Kind == RK_IntegerMinMax || Kind == RK_FloatMinMax) &&
      NumCmpSelectPatternInst != 2)
    return false;

  if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction)
    return false;
  // We found a reduction var if we have reached the original phi node and we
  // only have a single instruction with out-of-loop users.

  // This instruction is allowed to have out-of-loop users.
  AllowedExit.insert(ExitInstruction);

  // Save the description of this reduction variable.
  ReductionDescriptor RD(RdxStart, ExitInstruction, Kind,
                         ReduxDesc.MinMaxKind);
  Reductions[Phi] = RD;
  // We've ended the cycle. This is a reduction variable if we have an
  // outside user and it has a binary op.

  return true;
}
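// For illustration only (an assumed example, not part of the pass): a source
// loop that the reduction analysis above accepts as an integer min reduction
// looks like
//
//   int m = a[0];
//   for (int i = 1; i < n; ++i)
//     m = (a[i] < m) ? a[i] : m;
//
// Here the header PHI for 'm' feeds an icmp/select pair (counted by
// NumCmpSelectPatternInst), the select feeds back into the PHI, and the
// select is also the single instruction with out-of-loop users.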
/// Recognizes a Select(ICmp(X, Y), X, Y) instruction pattern corresponding to
/// a min(X, Y) or max(X, Y) and returns a descriptor for it.
LoopVectorizationLegality::ReductionInstDesc
LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I,
                                                    ReductionInstDesc &Prev) {
  assert((isa<ICmpInst>(I) || isa<FCmpInst>(I) || isa<SelectInst>(I)) &&
         "Expect a select instruction");
  Instruction *Cmp = nullptr;
  SelectInst *Select = nullptr;

  // We must handle the select(cmp()) as a single instruction. Advance to the
  // select.
  if ((Cmp = dyn_cast<ICmpInst>(I)) || (Cmp = dyn_cast<FCmpInst>(I))) {
    if (!Cmp->hasOneUse() || !(Select = dyn_cast<SelectInst>(*I->user_begin())))
      return ReductionInstDesc(false, I);
    return ReductionInstDesc(Select, Prev.MinMaxKind);
  }

  // Only handle single use cases for now.
  if (!(Select = dyn_cast<SelectInst>(I)))
    return ReductionInstDesc(false, I);
  if (!(Cmp = dyn_cast<ICmpInst>(I->getOperand(0))) &&
      !(Cmp = dyn_cast<FCmpInst>(I->getOperand(0))))
    return ReductionInstDesc(false, I);
  if (!Cmp->hasOneUse())
    return ReductionInstDesc(false, I);

  Value *CmpLeft;
  Value *CmpRight;

  // Look for a min/max pattern.
  if (m_UMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
    return ReductionInstDesc(Select, MRK_UIntMin);
  else if (m_UMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
    return ReductionInstDesc(Select, MRK_UIntMax);
  else if (m_SMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
    return ReductionInstDesc(Select, MRK_SIntMax);
  else if (m_SMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
    return ReductionInstDesc(Select, MRK_SIntMin);
  else if (m_OrdFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
    return ReductionInstDesc(Select, MRK_FloatMin);
  else if (m_OrdFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
    return ReductionInstDesc(Select, MRK_FloatMax);
  else if (m_UnordFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
    return ReductionInstDesc(Select, MRK_FloatMin);
  else if (m_UnordFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
    return ReductionInstDesc(Select, MRK_FloatMax);

  return ReductionInstDesc(false, I);
}
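// Illustrative IR shape (an assumed example) matched as a signed-min pattern
// by the routine above; m_SMin matches because the compare and the select use
// the same two operands:
//
//   %cmp = icmp slt i32 %x, %y
//   %min = select i1 %cmp, i32 %x, i32 %y
//
// The compare must have the select as its only user, so the pair can be
// treated as one reduction operation.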
LoopVectorizationLegality::ReductionInstDesc
LoopVectorizationLegality::isReductionInstr(Instruction *I,
                                            ReductionKind Kind,
                                            ReductionInstDesc &Prev) {
  bool FP = I->getType()->isFloatingPointTy();
  bool FastMath = FP && I->hasUnsafeAlgebra();
  switch (I->getOpcode()) {
  default:
    return ReductionInstDesc(false, I);
  case Instruction::PHI:
    if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd &&
               Kind != RK_FloatMinMax))
      return ReductionInstDesc(false, I);
    return ReductionInstDesc(I, Prev.MinMaxKind);
  case Instruction::Sub:
  case Instruction::Add:
    return ReductionInstDesc(Kind == RK_IntegerAdd, I);
  case Instruction::Mul:
    return ReductionInstDesc(Kind == RK_IntegerMult, I);
  case Instruction::And:
    return ReductionInstDesc(Kind == RK_IntegerAnd, I);
  case Instruction::Or:
    return ReductionInstDesc(Kind == RK_IntegerOr, I);
  case Instruction::Xor:
    return ReductionInstDesc(Kind == RK_IntegerXor, I);
  case Instruction::FMul:
    return ReductionInstDesc(Kind == RK_FloatMult && FastMath, I);
  case Instruction::FSub:
  case Instruction::FAdd:
    return ReductionInstDesc(Kind == RK_FloatAdd && FastMath, I);
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select:
    if (Kind != RK_IntegerMinMax &&
        (!HasFunNoNaNAttr || Kind != RK_FloatMinMax))
      return ReductionInstDesc(false, I);
    return isMinMaxSelectCmpPattern(I, Prev);
  }
}
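// Illustrative note (an assumed example): floating-point add/mul reductions
// are only accepted when the instruction carries unsafe-algebra (fast-math)
// flags, because vectorization reassociates the reduction, e.g.
//
//   float s = 0.0f;
//   for (int i = 0; i < n; ++i)
//     s += a[i];   // vectorizable as a reduction only under fast-math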
LoopVectorizationLegality::InductionKind
LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
  Type *PhiTy = Phi->getType();
  // We only handle integer and pointer induction variables.
  if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
    return IK_NoInduction;

  // Check that the PHI is consecutive.
  const SCEV *PhiScev = SE->getSCEV(Phi);
  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
  if (!AR) {
    DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
    return IK_NoInduction;
  }
  const SCEV *Step = AR->getStepRecurrence(*SE);

  // Integer inductions need to have a stride of one.
  if (PhiTy->isIntegerTy()) {
    if (Step->isOne())
      return IK_IntInduction;
    if (Step->isAllOnesValue())
      return IK_ReverseIntInduction;
    return IK_NoInduction;
  }

  // Calculate the pointer stride and check if it is consecutive.
  const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
  if (!C)
    return IK_NoInduction;

  assert(PhiTy->isPointerTy() && "The PHI must be a pointer");
  Type *PointerElementType = PhiTy->getPointerElementType();
  // The pointer stride cannot be determined if the pointer element type is
  // not sized.
  if (!PointerElementType->isSized())
    return IK_NoInduction;

  uint64_t Size = DL->getTypeAllocSize(PointerElementType);
  if (C->getValue()->equalsInt(Size))
    return IK_PtrInduction;
  else if (C->getValue()->equalsInt(0 - Size))
    return IK_ReversePtrInduction;
  return IK_NoInduction;
}
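// For illustration only (an assumed example): a pointer PHI such as
//
//   for (int *P = A; P != E; ++P)   // SCEV step == sizeof(int)
//
// has a constant step equal to the allocation size of the element type and
// is classified as IK_PtrInduction; a step of -sizeof(int) would be
// classified as IK_ReversePtrInduction.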
bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
  Value *In0 = const_cast<Value *>(V);
  PHINode *PN = dyn_cast_or_null<PHINode>(In0);
  if (!PN)
    return false;

  return Inductions.count(PN);
}

bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
  assert(TheLoop->contains(BB) && "Unknown block used");

  // Blocks that do not dominate the latch need predication.
  BasicBlock *Latch = TheLoop->getLoopLatch();
  return !DT->dominates(BB, Latch);
}
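// Illustrative example (assumed, not from this file): in
//
//   for (int i = 0; i < n; ++i)
//     if (c[i])
//       b[i] = 0;   // this block does not dominate the latch
//
// the conditional-store block executes on only some paths to the latch, so
// it needs predication (if-conversion) before it can be vectorized.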
bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
                                            SmallPtrSetImpl<Value *> &SafePtrs) {
  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
    // Check that we don't have a constant expression that can trap as an
    // operand.
    for (Instruction::op_iterator OI = it->op_begin(), OE = it->op_end();
         OI != OE; ++OI) {
      if (Constant *C = dyn_cast<Constant>(*OI))
        if (C->canTrap())
          return false;
    }

    // We might be able to hoist the load.
    if (it->mayReadFromMemory()) {
      LoadInst *LI = dyn_cast<LoadInst>(it);
      if (!LI)
        return false;
      if (!SafePtrs.count(LI->getPointerOperand())) {
        if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand())) {
          MaskedOp.insert(LI);
          continue;
        }
        return false;
      }
    }

    // Stores are predicated only under the restrictions below.
    if (it->mayWriteToMemory()) {
      StoreInst *SI = dyn_cast<StoreInst>(it);
      // We only support predication of stores in basic blocks with one
      // predecessor.
      if (!SI)
        return false;

      bool isSafePtr = (SafePtrs.count(SI->getPointerOperand()) != 0);
      bool isSinglePredecessor = SI->getParent()->getSinglePredecessor();

      if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr ||
          !isSinglePredecessor) {
        // Build a masked store if it is legal for the target, otherwise
        // scalarize the block.
        bool isLegalMaskedOp =
          isLegalMaskedStore(SI->getValueOperand()->getType(),
                             SI->getPointerOperand());
        if (isLegalMaskedOp) {
          --NumPredStores;
          MaskedOp.insert(SI);
          continue;
        }
        return false;
      }
    }
    if (it->mayThrow())
      return false;

    // The instructions below can trap.
    switch (it->getOpcode()) {
    default: continue;
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::URem:
    case Instruction::SRem:
      return false;
    }
  }

  return true;
}
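// Illustrative note (an assumed example): for a loop body like
//
//   if (c[i])
//     sum += a[i];
//
// the load of a[i] executes conditionally. If its pointer is not known safe
// to dereference unconditionally, the check above falls back to a masked
// load when the target reports it as legal, recording the instruction in
// MaskedOp so it is widened with a mask instead of being scalarized.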
LoopVectorizationCostModel::VectorizationFactor
LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
  // Width 1 means no vectorization.
  VectorizationFactor Factor = { 1U, 0U };
  if (OptForSize && Legal->getRuntimePointerCheck()->Need) {
    emitAnalysis(Report() << "runtime pointer checks needed. Enable "
                             "vectorization of this loop with '#pragma clang "
                             "loop vectorize(enable)' when compiling with -Os");
    DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n");
    return Factor;
  }

  if (!EnableCondStoresVectorization && Legal->NumPredStores) {
    emitAnalysis(Report() << "store that is conditionally executed prevents "
                             "vectorization");
    DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
    return Factor;
  }

  // Find the trip count.
  unsigned TC = SE->getSmallConstantTripCount(TheLoop);
  DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');

  unsigned WidestType = getWidestType();
  unsigned WidestRegister = TTI.getRegisterBitWidth(true);
  unsigned MaxSafeDepDist = -1U;
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
  WidestRegister = ((WidestRegister < MaxSafeDepDist) ?
                    WidestRegister : MaxSafeDepDist);
  unsigned MaxVectorSize = WidestRegister / WidestType;
  DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n");
  DEBUG(dbgs() << "LV: The Widest register is: "
               << WidestRegister << " bits.\n");

  if (MaxVectorSize == 0) {
    DEBUG(dbgs() << "LV: The target has no vector registers.\n");
    MaxVectorSize = 1;
  }

  assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
         " into one vector!");

  unsigned VF = MaxVectorSize;

  // If we optimize the program for size, avoid creating the tail loop.
  if (OptForSize) {
    // If we are unable to calculate the trip count then don't try to
    // vectorize.
    if (TC < 2) {
      emitAnalysis(Report() << "unable to calculate the loop count due to "
                               "complex control flow");
      DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
      return Factor;
    }

    // Find the maximum SIMD width that can fit within the trip count.
    VF = TC % MaxVectorSize;
    if (VF == 0)
      VF = MaxVectorSize;

    // If the trip count that we found modulo the vectorization factor is not
    // zero then we require a tail.
    if (VF < 2) {
      emitAnalysis(Report() << "cannot optimize for size and vectorize at the "
                               "same time. Enable vectorization of this loop "
                               "with '#pragma clang loop vectorize(enable)' "
                               "when compiling with -Os");
      DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
      return Factor;
    }
  }
  int UserVF = Hints->getWidth();
  if (UserVF != 0) {
    assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
    DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");

    Factor.Width = UserVF;
    return Factor;
  }

  float Cost = expectedCost(1);
#ifndef NDEBUG
  const float ScalarCost = Cost;
#endif /* NDEBUG */
  unsigned Width = 1;
  DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");

  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
  // Ignore the scalar width, because the user explicitly wants vectorization.
  if (ForceVectorization && VF > 1) {
    Width = 2;
    Cost = expectedCost(Width) / (float)Width;
  }

  for (unsigned i = 2; i <= VF; i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the width of
    // the vector elements.
    float VectorCost = expectedCost(i) / (float)i;
    DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " <<
          (int)VectorCost << ".\n");
    if (VectorCost < Cost) {
      Cost = VectorCost;
      Width = i;
    }
  }

  DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
        << "LV: Vectorization seems to be not beneficial, "
        << "but was forced by a user.\n");
  DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
  Factor.Width = Width;
  Factor.Cost = Width * Cost;
  return Factor;
}
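// Worked example with assumed numbers (illustration only): if the scalar loop
// costs 8 and expectedCost(4) returns 20, the per-lane cost at VF=4 is
// 20/4 = 5, which beats the scalar cost of 8, so VF=4 is selected unless a
// wider factor is cheaper still per lane.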
unsigned LoopVectorizationCostModel::getWidestType() {
  unsigned MaxWidth = 8;

  // For each block.
  for (Loop::block_iterator bb = TheLoop->block_begin(),
       be = TheLoop->block_end(); bb != be; ++bb) {
    BasicBlock *BB = *bb;

    // For each instruction in the loop.
    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
      Type *T = it->getType();

      // Ignore ephemeral values.
      if (EphValues.count(it))
        continue;

      // Only examine Loads, Stores and PHINodes.
      if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it))
        continue;

      // Examine PHI nodes that are reduction variables.
      if (PHINode *PN = dyn_cast<PHINode>(it))
        if (!Legal->getReductionVars()->count(PN))
          continue;

      // Examine the stored values.
      if (StoreInst *ST = dyn_cast<StoreInst>(it))
        T = ST->getValueOperand()->getType();

      // Ignore loaded pointer types and stored pointer types that are not
      // consecutive. However, we do want to take consecutive stores/loads of
      // pointer vectors into account.
      if (T->isPointerTy() && !isConsecutiveLoadOrStore(it))
        continue;

      MaxWidth = std::max(MaxWidth,
                          (unsigned)DL->getTypeSizeInBits(T->getScalarType()));
    }
  }

  return MaxWidth;
}
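// Illustrative arithmetic (assumed numbers): with 128-bit vector registers
// and a widest loop type of 32 bits, MaxVectorSize = 128 / 32 = 4, so the
// cost model above only needs to compare VF = 2 and VF = 4 against scalar.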
unsigned
LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
                                               unsigned VF,
                                               unsigned LoopCost) {
  // -- The unroll heuristics --
  // We unroll the loop in order to expose ILP and reduce the loop overhead.
  // There are many micro-architectural considerations that we can't predict
  // at this level. For example, frontend pressure (on decode or fetch) due to
  // code size, or the number and capabilities of the execution ports.
  //
  // We use the following heuristics to select the unroll factor:
  // 1. If the code has reductions, then we unroll in order to break the cross
  // iteration dependency.
  // 2. If the loop is really small, then we unroll in order to reduce the loop
  // overhead.
  // 3. We don't unroll if we think that we will spill registers to memory due
  // to the increased register pressure.

  // Use the user preference, unless 'auto' is selected.
  int UserUF = Hints->getInterleave();
  if (UserUF != 0)
    return UserUF;

  // When we optimize for size, we don't unroll.
  if (OptForSize)
    return 1;

  // The max safe dependence distance already limited the vectorization
  // factor, so do not unroll on top of it.
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    return 1;

  // Do not unroll loops with a relatively small trip count.
  unsigned TC = SE->getSmallConstantTripCount(TheLoop);
  if (TC > 1 && TC < TinyTripCountUnrollThreshold)
    return 1;

  unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
  DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters <<
        " registers\n");

  if (VF == 1) {
    if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
      TargetNumRegisters = ForceTargetNumScalarRegs;
  } else {
    if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
      TargetNumRegisters = ForceTargetNumVectorRegs;
  }
  LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage();
  // We divide by these constants so assume that we have at least one
  // instruction that uses at least one register.
  R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
  R.NumInstructions = std::max(R.NumInstructions, 1U);

  // We calculate the unroll factor using the following formula.
  // Subtract the number of loop invariants from the number of available
  // registers. These registers are used by all of the unrolled instances.
  // Next, divide the remaining registers by the number of registers that is
  // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills. All of this is rounded down if necessary to
  // be a power of two. We want power of two unroll factors to simplify any
  // addressing operations or alignment considerations.
  unsigned UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
                              R.MaxLocalUsers);

  // Don't count the induction variable as unrolled.
  if (EnableIndVarRegisterHeur)
    UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
                       std::max(1U, (R.MaxLocalUsers - 1)));
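  // Worked example with assumed numbers (illustration only): with 16 target
  // registers, 2 loop-invariant values and 3 registers of local usage,
  // UF = PowerOf2Floor((16 - 2) / 3) = PowerOf2Floor(4) = 4 parallel
  // instances before we expect spilling.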
  // Clamp the unroll factor ranges to reasonable factors.
  unsigned MaxInterleaveSize = TTI.getMaxInterleaveFactor();

  // Check if the user has overridden the unroll max.
  if (VF == 1) {
    if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveSize = ForceTargetMaxScalarInterleaveFactor;
  } else {
    if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveSize = ForceTargetMaxVectorInterleaveFactor;
  }

  // If we did not calculate the cost for VF (because the user selected the
  // VF) then we calculate the cost of VF here.
  if (LoopCost == 0)
    LoopCost = expectedCost(VF);

  // Clamp the calculated UF to be between 1 and the max unroll factor
  // that the target allows.
  if (UF > MaxInterleaveSize)
    UF = MaxInterleaveSize;
  else if (UF < 1)
    UF = 1;
  // Unroll if we vectorized this loop and there is a reduction that could
  // benefit from unrolling.
  if (VF > 1 && Legal->getReductionVars()->size()) {
    DEBUG(dbgs() << "LV: Unrolling because of reductions.\n");
    return UF;
  }

  // Note that if we've already vectorized the loop we will have done the
  // runtime check and so unrolling won't require further checks.
  bool UnrollingRequiresRuntimePointerCheck =
      (VF == 1 && Legal->getRuntimePointerCheck()->Need);

  // We want to unroll small loops in order to reduce the loop overhead and
  // potentially expose ILP opportunities.
  DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
  if (!UnrollingRequiresRuntimePointerCheck &&
      LoopCost < SmallLoopCost) {
    // We assume that the cost overhead is 1 and we use the cost model
    // to estimate the cost of the loop and unroll until the cost of the
    // loop overhead is about 5% of the cost of the loop.
    unsigned SmallUF =
        std::min(UF, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));

    // Unroll until store/load ports (estimated by max unroll factor) are
    // saturated.
    unsigned StoresUF = UF / (Legal->NumStores ? Legal->NumStores : 1);
    unsigned LoadsUF = UF / (Legal->NumLoads ? Legal->NumLoads : 1);

    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're unrolling is inside another loop. Limit, by default, to 2, so the
    // critical path only gets increased by one reduction operation.
    if (Legal->getReductionVars()->size() &&
        TheLoop->getLoopDepth() > 1) {
      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionUF);
      SmallUF = std::min(SmallUF, F);
      StoresUF = std::min(StoresUF, F);
      LoadsUF = std::min(LoadsUF, F);
    }

    if (EnableLoadStoreRuntimeUnroll && std::max(StoresUF, LoadsUF) > SmallUF) {
      DEBUG(dbgs() << "LV: Unrolling to saturate store or load ports.\n");
      return std::max(StoresUF, LoadsUF);
    }

    DEBUG(dbgs() << "LV: Unrolling to reduce branch cost.\n");
    return SmallUF;
  }

  DEBUG(dbgs() << "LV: Not Unrolling.\n");
  return 1;
}
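// Worked example with assumed numbers (illustration only): for a loop body
// costing 6 with SmallLoopCost assumed to be 20 and a register-pressure UF of
// 4, SmallUF = min(4, PowerOf2Floor(20 / 6)) = min(4, 2) = 2, so the loop is
// unrolled twice to amortize the branch overhead.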
LoopVectorizationCostModel::RegisterUsage
LoopVectorizationCostModel::calculateRegisterUsage() {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is
  // a very rough estimation. We scan the loop in topological order and assign
  // a number to each instruction. We use RPO to ensure that defs are met
  // before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi map that
  // holds the list of intervals that *end* at a specific location. This multi
  // map allows us to perform a linear search. We scan the instructions
  // linearly and record each time that a new interval starts, by placing it
  // in a set. If we find this value in the multi-map then we remove it from
  // the set. The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but
  // are used inside the loop. We need this number separately from the
  // max-interval usage number because when we unroll, loop-invariant values
  // do not take more registers.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage R;
  R.NumInstructions = 0;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  typedef DenseMap<Instruction *, unsigned> IntervalMap;

  // Maps instruction to its index.
  DenseMap<unsigned, Instruction *> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of instructions that are used in the loop.
  SmallSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value *, 8> LoopInvariants;

  unsigned Index = 0;
  for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
       be = DFS.endRPO(); bb != be; ++bb) {
    R.NumInstructions += (*bb)->size();
    for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
         ++it) {
      Instruction *I = it;
      IdxToInstr[Index++] = I;

      // Save the end location of each USE.
      for (unsigned i = 0; i < I->getNumOperands(); ++i) {
        Value *U = I->getOperand(i);
        Instruction *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Instr) continue;

        // If this instruction is outside the loop then record it and
        // continue.
        if (!TheLoop->contains(Instr)) {
          LoopInvariants.insert(Instr);
          continue;
        }

        // Overwrite previous end points.
        EndPoint[Instr] = Index;
        Ends.insert(Instr);
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
  typedef SmallVector<Instruction *, 2> InstrList;
  DenseMap<unsigned, InstrList> TransposeEnds;

  // Transpose the EndPoints to a list of values that end at each index.
  for (IntervalMap::iterator it = EndPoint.begin(), e = EndPoint.end();
       it != e; ++it)
    TransposeEnds[it->second].push_back(it->first);

  SmallSet<Instruction *, 8> OpenIntervals;
  unsigned MaxUsage = 0;

  DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
  for (unsigned int i = 0; i < Index; ++i) {
    Instruction *I = IdxToInstr[i];
    // Ignore instructions that are never used within the loop.
    if (!Ends.count(I)) continue;

    // Ignore ephemeral values.
    if (EphValues.count(I))
      continue;

    // Remove all of the instructions that end at this location.
    InstrList &List = TransposeEnds[i];
    for (unsigned int j = 0, e = List.size(); j < e; ++j)
      OpenIntervals.erase(List[j]);

    // Count the number of live intervals.
    MaxUsage = std::max(MaxUsage, OpenIntervals.size());

    DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " <<
          OpenIntervals.size() << '\n');

    // Add the current instruction to the list of open intervals.
    OpenIntervals.insert(I);
  }

  unsigned Invariant = LoopInvariants.size();
  DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << '\n');
  DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
  DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n');

  R.LoopInvariantRegs = Invariant;
  R.MaxLocalUsers = MaxUsage;
  return R;
}
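// Illustrative note (an assumed example): in a chain like
//
//   %a = ...
//   %b = ...
//   %c = add %a, %b
//
// the intervals of %a and %b overlap at the add, so two values are live at
// once and MaxLocalUsers would be at least 2.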
unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
  unsigned Cost = 0;

  // For each block.
  for (Loop::block_iterator bb = TheLoop->block_begin(),
       be = TheLoop->block_end(); bb != be; ++bb) {
    unsigned BlockCost = 0;
    BasicBlock *BB = *bb;

    // For each instruction in the old loop.
    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
      // Skip dbg intrinsics.
      if (isa<DbgInfoIntrinsic>(it))
        continue;

      // Ignore ephemeral values.
      if (EphValues.count(it))
        continue;

      unsigned C = getInstructionCost(it, VF);

      // Check if we should override the cost.
      if (ForceTargetInstructionCost.getNumOccurrences() > 0)
        C = ForceTargetInstructionCost;

      BlockCost += C;
      DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " <<
            VF << " For instruction: " << *it << '\n');
    }

    // We assume that if-converted blocks have a 50% chance of being executed.
    // When the code is scalar then some of the blocks are avoided due to CF.
    // When the code is vectorized we execute all code paths.
    if (VF == 1 && Legal->blockNeedsPredication(*bb))
      BlockCost /= 2;

    Cost += BlockCost;
  }

  return Cost;
}
/// \brief Check whether the address computation for a non-consecutive memory
/// access looks like an unlikely candidate for being merged into the indexing
/// mode.
///
/// We look for a GEP which has one index that is an induction variable and all
/// other indices are loop invariant. If the stride of this access is also
/// within a small bound we decide that this address computation can likely be
/// merged into the addressing mode.
/// In all other cases, we identify the address computation as complex.
static bool isLikelyComplexAddressComputation(Value *Ptr,
                                              LoopVectorizationLegality *Legal,
                                              ScalarEvolution *SE,
                                              const Loop *TheLoop) {
  GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
  if (!Gep)
    return true;

  // We are looking for a gep with all loop invariant indices except for one
  // which should be an induction variable.
  unsigned NumOperands = Gep->getNumOperands();
  for (unsigned i = 1; i < NumOperands; ++i) {
    Value *Opd = Gep->getOperand(i);
    if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
        !Legal->isInductionVariable(Opd))
      return true;
  }

  // Now we know we have a GEP ptr, %inv, %ind, %inv. Make sure that the step
  // can likely be merged into the address computation.
  unsigned MaxMergeDistance = 64;

  const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Ptr));
  if (!AddRec)
    return true;

  // Check the step is constant.
  const SCEV *Step = AddRec->getStepRecurrence(*SE);
  // Calculate the pointer stride and check if it is consecutive.
  const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
  if (!C)
    return true;

  const APInt &APStepVal = C->getValue()->getValue();

  // Huge step value - give up.
  if (APStepVal.getBitWidth() > 64)
    return true;

  int64_t StepVal = APStepVal.getSExtValue();
  return StepVal > MaxMergeDistance;
}
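// Illustrative note (an assumed example, not from this file): a strided
// access whose address recurrence has a small constant step, such as 12 bytes
// for an i32 array touched every third element, stays within
// MaxMergeDistance (64) and is treated as foldable into the addressing mode;
// a larger or non-constant step is classified as a complex address
// computation and charged accordingly.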
static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
  return Legal->hasStride(I->getOperand(0)) ||
         Legal->hasStride(I->getOperand(1));
}
unsigned
LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
  // If we know that this instruction will remain uniform, check the cost of
  // the scalar version.
  if (Legal->isUniformAfterVectorization(I))
    VF = 1;

  Type *RetTy = I->getType();
  Type *VectorTy = ToVectorTy(RetTy, VF);

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Br: {
    return TTI.getCFInstrCost(I->getOpcode());
  }
  case Instruction::PHI:
    //TODO: IF-converted IFs become selects.
    return 0;
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Since we will replace the stride by 1 the multiplication should go away.
    if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
      return 0;

    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this are shifts on x86.
    TargetTransformInfo::OperandValueKind Op1VK =
      TargetTransformInfo::OK_AnyValue;
    TargetTransformInfo::OperandValueKind Op2VK =
      TargetTransformInfo::OK_AnyValue;
    TargetTransformInfo::OperandValueProperties Op1VP =
      TargetTransformInfo::OP_None;
    TargetTransformInfo::OperandValueProperties Op2VP =
      TargetTransformInfo::OP_None;
    Value *Op2 = I->getOperand(1);

    // Check for a splat of a constant or for a non-uniform vector of
    // constants.
    if (ConstantInt *CInt = dyn_cast<ConstantInt>(Op2)) {
      if (CInt->getValue().isPowerOf2())
        Op2VP = TargetTransformInfo::OP_PowerOf2;
      Op2VK = TargetTransformInfo::OK_UniformConstantValue;
    } else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) {
      Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
      Constant *SplatValue = cast<Constant>(Op2)->getSplatValue();
      if (SplatValue) {
        ConstantInt *CInt = dyn_cast<ConstantInt>(SplatValue);
        if (CInt && CInt->getValue().isPowerOf2())
          Op2VP = TargetTransformInfo::OP_PowerOf2;
        Op2VK = TargetTransformInfo::OK_UniformConstantValue;
      }
    }

    return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK,
                                      Op1VP, Op2VP);
  }
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(I);
    const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
    bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
    Type *CondTy = SI->getCondition()->getType();
    if (!ScalarCond)
      CondTy = VectorType::get(CondTy, VF);

    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Type *ValTy = I->getOperand(0)->getType();
    VectorTy = ToVectorTy(ValTy, VF);
    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);
  }
  case Instruction::Store:
  case Instruction::Load: {
    StoreInst *SI = dyn_cast<StoreInst>(I);
    LoadInst *LI = dyn_cast<LoadInst>(I);
    Type *ValTy = (SI ? SI->getValueOperand()->getType() :
                   LI->getType());
    VectorTy = ToVectorTy(ValTy, VF);

    unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment();
    unsigned AS = SI ? SI->getPointerAddressSpace() :
      LI->getPointerAddressSpace();
    Value *Ptr = SI ? SI->getPointerOperand() : LI->getPointerOperand();

    // We add the cost of address computation here instead of with the gep
    // instruction because only at this point do we know whether the operation
    // is scalarized.
    if (VF == 1)
      return TTI.getAddressComputationCost(VectorTy) +
        TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);

    // Scalarized loads/stores.
    int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
    bool Reverse = ConsecutiveStride < 0;
    unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ValTy);
    unsigned VectorElementSize = DL->getTypeStoreSize(VectorTy) / VF;
    if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) {
      bool IsComplexComputation =
        isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop);
      unsigned Cost = 0;
      // The cost of extracting from the value vector and pointer vector.
      Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
      for (unsigned i = 0; i < VF; ++i) {
        // The cost of extracting the pointer operand.
        Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i);
        // In case of STORE, the cost of ExtractElement from the vector.
        // In case of LOAD, the cost of InsertElement into the returned
        // vector.
        Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement :
                                            Instruction::InsertElement,
                                       VectorTy, i);
      }

      // The cost of the scalar loads/stores.
      Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation);
      Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
                                       Alignment, AS);
      return Cost;
    }

    // Wide load/stores.
    unsigned Cost = TTI.getAddressComputationCost(VectorTy);
    Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);

    if (Reverse)
      Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
                                 VectorTy, 0);
    return Cost;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    // We optimize the truncation of induction variables.
    // The cost of these is the same as the scalar operation.
    if (I->getOpcode() == Instruction::Trunc &&
        Legal->isInductionVariable(I->getOperand(0)))
      return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
                                  I->getOperand(0)->getType());

    Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
    return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(I);
    Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
    assert(ID && "Not an intrinsic call!");
    Type *RetTy = ToVectorTy(CI->getType(), VF);
    SmallVector<Type *, 4> Tys;
    for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
      Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
    return TTI.getIntrinsicInstrCost(ID, RetTy, Tys);
  }
  default: {
    // We are scalarizing the instruction. Return the cost of the scalar
    // instruction, plus the cost of insert and extract into vector
    // elements, times the vector width.
    unsigned Cost = 0;

    if (!RetTy->isVoidTy() && VF != 1) {
      unsigned InsCost = TTI.getVectorInstrCost(Instruction::InsertElement,
                                                VectorTy);
      unsigned ExtCost = TTI.getVectorInstrCost(Instruction::ExtractElement,
                                                VectorTy);

      // The cost of inserting the results plus extracting each one of the
      // operands.
      Cost += VF * (InsCost + ExtCost * I->getNumOperands());
    }

    // The cost of executing VF copies of the scalar instruction. This opcode
    // is unknown. Assume that it is the same as 'mul'.
    Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
    return Cost;
  }
  } // end of switch.
}

Type *LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) {
  if (Scalar->isVoidTy() || VF == 1)
    return Scalar;
  return VectorType::get(Scalar, VF);
}
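// Illustrative note (an assumed example): ToVectorTy(i32, 4) yields <4 x i32>,
// while ToVectorTy(i32, 1) and ToVectorTy(void, VF) return the scalar type
// unchanged, so scalar cost queries fall out of the same code path.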
char LoopVectorize::ID = 0;
static const char lv_name[] = "Loop Vectorization";
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfo)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_DEPENDENCY(LCSSA)
INITIALIZE_PASS_DEPENDENCY(LoopInfo)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)

namespace llvm {
  Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) {
    return new LoopVectorize(NoUnrolling, AlwaysVectorize);
  }
}
bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
  // Check for a store.
  if (StoreInst *ST = dyn_cast<StoreInst>(Inst))
    return Legal->isConsecutivePtr(ST->getPointerOperand()) != 0;

  // Check for a load.
  if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
    return Legal->isConsecutivePtr(LI->getPointerOperand()) != 0;

  return false;
}
void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
                                             bool IfPredicateStore) {
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
  // Holds vector parameters or scalars, in case of uniform vals.
  SmallVector<VectorParts, 4> Params;

  setDebugLocFromInst(Builder, Instr);

  // Find all of the vectorized parameters.
  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
    Value *SrcOp = Instr->getOperand(op);

    // If we are accessing the old induction variable, use the new one.
    if (SrcOp == OldInduction) {
      Params.push_back(getVectorValue(SrcOp));
      continue;
    }

    // Try using previously calculated values.
    Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);

    // If the src is an instruction that appeared earlier in the basic block
    // then it should already be vectorized.
    if (SrcInst && OrigLoop->contains(SrcInst)) {
      assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
      // The parameter is a vector value from earlier.
      Params.push_back(WidenMap.get(SrcInst));
    } else {
      // The parameter is a scalar from outside the loop. Maybe even a
      // constant.
      VectorParts Scalars;
      Scalars.append(UF, SrcOp);
      Params.push_back(Scalars);
    }
  }

  assert(Params.size() == Instr->getNumOperands() &&
         "Invalid number of operands");

  // Does this instruction return a value?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Value *UndefVec = IsVoidRetTy ? nullptr :
    UndefValue::get(Instr->getType());
  // Create a new entry in the WidenMap and initialize it to Undef or Null.
  VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);

  Instruction *InsertPt = Builder.GetInsertPoint();
  BasicBlock *IfBlock = Builder.GetInsertBlock();
  BasicBlock *CondBlock = nullptr;

  VectorParts Cond;
  Loop *VectorLp = nullptr;
  if (IfPredicateStore) {
    assert(Instr->getParent()->getSinglePredecessor() &&
           "Only support single predecessor blocks");
    Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
                          Instr->getParent());
    VectorLp = LI->getLoopFor(IfBlock);
    assert(VectorLp && "Must have a loop for this block");
  }

  // For each vector unroll 'part':
  for (unsigned Part = 0; Part < UF; ++Part) {
    // For each scalar that we create:

    // Start an "if (pred) a[i] = ..." block.
    Value *Cmp = nullptr;
    if (IfPredicateStore) {
      if (Cond[Part]->getType()->isVectorTy())
        Cond[Part] =
            Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0));
      Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part],
                               ConstantInt::get(Cond[Part]->getType(), 1));
      CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
      LoopVectorBody.push_back(CondBlock);
      VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase());
      // Update Builder with newly created basic block.
      Builder.SetInsertPoint(InsertPt);
    }

    Instruction *Cloned = Instr->clone();
    if (!IsVoidRetTy)
      Cloned->setName(Instr->getName() + ".cloned");

    // Replace the operands of the cloned instructions with extracted scalars.
    for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
      Value *Op = Params[op][Part];
      Cloned->setOperand(op, Op);
    }

    // Place the cloned scalar in the new loop.
    Builder.Insert(Cloned);

    // If the original scalar returns a value we need to place it in a vector
    // so that future users will be able to use it.
    if (!IsVoidRetTy)
      VecResults[Part] = Cloned;

    // End if-block.
    if (IfPredicateStore) {
      BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
      LoopVectorBody.push_back(NewIfBlock);
      VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase());
      Builder.SetInsertPoint(InsertPt);
      Instruction *OldBr = IfBlock->getTerminator();
      BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
      OldBr->eraseFromParent();
      IfBlock = NewIfBlock;
    }
  }
}
void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
  StoreInst *SI = dyn_cast<StoreInst>(Instr);
  bool IfPredicateStore = (SI && Legal->blockNeedsPredication(SI->getParent()));

  return scalarizeInstruction(Instr, IfPredicateStore);
}

Value *InnerLoopUnroller::reverseVector(Value *Vec) {
  return Vec;
}

Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) {
  return V;
}

Value *InnerLoopUnroller::getConsecutiveVector(Value *Val, int StartIdx,
                                               bool Negate) {
  // When unrolling and the VF is 1, we only need to add a simple scalar.
  Type *ITy = Val->getType();
  assert(!ITy->isVectorTy() && "Val must be a scalar");
  Constant *C = ConstantInt::get(ITy, StartIdx, Negate);
  return Builder.CreateAdd(Val, C, "induction");
}