Commit 635059af authored by Gaurav Kukreja

Combined commit for two major changes:

 * Added pipeline_sim for simulating the pipeline; not completely tested
 * Added Cache HW Mod for Cortex A5
Signed-off-by: Gaurav Kukreja <gaurav@gauravk.in>
parent 70b806db
......@@ -10,7 +10,7 @@ current_dir := $(patsubst %/,%,$(dir $(mkfile_path)))
CSIM_DIR = $(current_dir)
# Hardware Model to use
-CACHESIM_HWMOD = generic
+CACHESIM_HWMOD = cortexA5
CACHESIM_SRC = $(CSIM_DIR)/src
CACHESIM_HEADERS = $(CSIM_DIR)/headers/
......
......@@ -15,6 +15,11 @@ ifeq ($(CACHESIM_HWMOD),generic)
OBJECTS += genericHwMod.o
endif
ifeq ($(CACHESIM_HWMOD),cortexA5)
SOURCES += cortexA5HwMod.c
OBJECTS += cortexA5HwMod.o
endif
all: cacheSim
cacheSim: $(SOURCES)
......
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "cacheSimHwMod.h"
#define CACHELINE_VALID_BIT (1 << 0)
#define IS_CACHELINE_VALID(flags) ((flags) & CACHELINE_VALID_BIT)
#define SET_CACHELINE_VALID(flags) ((flags) |= CACHELINE_VALID_BIT)
#define SET_CACHELINE_INVALID(flags) ((flags) &= ~CACHELINE_VALID_BIT)
#define CACHELINE_DIRTY_BIT (1 << 1)
#define IS_CACHELINE_DIRTY(flags) ((flags) & CACHELINE_DIRTY_BIT)
#define SET_CACHELINE_DIRTY(flags) ((flags) |= CACHELINE_DIRTY_BIT)
#define SET_CACHELINE_CLEAN(flags) ((flags) &= ~CACHELINE_DIRTY_BIT)
#define ADDRESS_LEN_BITS 32
/**** DATA STRUCTURES *********************************************************/
struct cacheConfig
{
// Size
unsigned int lineLenBytes;
unsigned int cacheSizeBytes;
unsigned int numSets; // associativity: number of ways
// Derived
unsigned int numLines; // lines per way, i.e. the number of sets
unsigned long tagMask;
unsigned int tagLenBits;
unsigned long indexMask;
unsigned int indexLenBits;
// Features
unsigned int isWriteThrough;
// Latencies
unsigned int hitLatency;
unsigned int missLatency;
};
typedef struct cacheConfig cacheConfig_t;
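For concreteness, here is how the derived fields work out for the L1 data cache parameters set in initCacheParams() below (32-byte lines, 4 KB, numSets = 4). This worked example is illustrative only and not part of the commit:

/* Worked example (not in the commit), assuming 32-bit addresses:
 *   numLines     = 4096 / (32 * 4) = 32   lines per way
 *   offset       = log2(32)        = 5    bits
 *   indexLenBits = log2(32)        = 5    bits
 *   tagLenBits   = 32 - 5 - 5      = 22   bits
 *   indexMask    = 0x000003E0             (bits 9..5)
 *   tagMask      = 0xFFFFFC00             (bits 31..10)
 */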
/**
* Stores all data related to a cache line.
*/
struct cacheLine
{
unsigned int flags;
unsigned long tag;
};
typedef struct cacheLine cacheLine_t;
/**** GLOBAL VARIABLES ********************************************************/
cacheConfig_t L1DCacheConf;
cacheConfig_t L1ICacheConf;
cacheConfig_t L2CacheConf;
cacheLine_t **L1DCache;
cacheLine_t **L1ICache;
cacheLine_t **L2Cache;
unsigned int memWriteLatency = 100;
unsigned int memReadLatency = 100;
unsigned long L1D_Hit_Read = 0;
unsigned long L1D_Hit_Writeback = 0;
unsigned long L1D_Hit_Writethrough = 0;
unsigned long L1D_Miss = 0;
unsigned long L1I_Hit_Read = 0;
unsigned long L1I_Hit_Writeback = 0;
unsigned long L1I_Hit_Writethrough = 0;
unsigned long L1I_Miss = 0;
unsigned long L2_Hit_Read = 0;
unsigned long L2_Hit_Writeback = 0;
unsigned long L2_Hit_Writethrough = 0;
unsigned long L2_Miss = 0;
/**** LOCAL FUNCTIONS *********************************************************/
int log_base2(int val)
{
int ret = 0;
while (val >>= 1) ++ret;
return ret;
}
void initCacheParams ()
{
int subIndexLen = 0;
int i;
/*** L1 DCache *****************/
L1DCacheConf.lineLenBytes = 32;
L1DCacheConf.cacheSizeBytes = 4 * 1024; // 4 KB
L1DCacheConf.numSets = 4;
L1DCacheConf.numLines = L1DCacheConf.cacheSizeBytes /
(L1DCacheConf.lineLenBytes * L1DCacheConf.numSets);
subIndexLen = log_base2(L1DCacheConf.lineLenBytes);
L1DCacheConf.indexLenBits = log_base2(L1DCacheConf.numLines);
L1DCacheConf.indexMask = 0;
for (i = 0; i < L1DCacheConf.indexLenBits; i++)
{
L1DCacheConf.indexMask = L1DCacheConf.indexMask << 1;
L1DCacheConf.indexMask |= 0x00000001;
}
L1DCacheConf.indexMask = L1DCacheConf.indexMask << subIndexLen;
L1DCacheConf.tagLenBits = ADDRESS_LEN_BITS - L1DCacheConf.indexLenBits - subIndexLen;
L1DCacheConf.tagMask = 0;
for (i = 0; i < L1DCacheConf.tagLenBits; i++)
{
L1DCacheConf.tagMask = L1DCacheConf.tagMask << 1;
L1DCacheConf.tagMask |= 0x00000001;
}
L1DCacheConf.tagMask = L1DCacheConf.tagMask << (L1DCacheConf.indexLenBits + subIndexLen);
L1DCacheConf.isWriteThrough = 0;
L1DCacheConf.hitLatency = 2;
L1DCacheConf.missLatency = 2;
/*** L1 ICache *****************/
L1ICacheConf.lineLenBytes = 32;
L1ICacheConf.cacheSizeBytes = 4 * 1024; // 4 KB
L1ICacheConf.numSets = 2;
L1ICacheConf.numLines = L1ICacheConf.cacheSizeBytes /
(L1ICacheConf.lineLenBytes * L1ICacheConf.numSets);
subIndexLen = log_base2(L1ICacheConf.lineLenBytes);
L1ICacheConf.indexLenBits = log_base2(L1ICacheConf.numLines);
L1ICacheConf.indexMask = 0;
for (i = 0; i < L1ICacheConf.indexLenBits; i++)
{
L1ICacheConf.indexMask = L1ICacheConf.indexMask << 1;
L1ICacheConf.indexMask |= 0x00000001;
}
L1ICacheConf.indexMask = L1ICacheConf.indexMask << subIndexLen;
L1ICacheConf.tagLenBits = ADDRESS_LEN_BITS - L1ICacheConf.indexLenBits - subIndexLen;
L1ICacheConf.tagMask = 0;
for (i = 0; i < L1ICacheConf.tagLenBits; i++)
{
L1ICacheConf.tagMask = L1ICacheConf.tagMask << 1;
L1ICacheConf.tagMask |= 0x00000001;
}
L1ICacheConf.tagMask = L1ICacheConf.tagMask << (L1ICacheConf.indexLenBits + subIndexLen);
L1ICacheConf.isWriteThrough = 0;
L1ICacheConf.hitLatency = 2;
L1ICacheConf.missLatency = 2;
/*** L2 Cache *****************/
L2CacheConf.lineLenBytes = 32;
L2CacheConf.cacheSizeBytes = 32 * 1024; // 32 KB
L2CacheConf.numSets = 2;
L2CacheConf.numLines = L2CacheConf.cacheSizeBytes /
(L2CacheConf.lineLenBytes * L2CacheConf.numSets);
subIndexLen = log_base2(L2CacheConf.lineLenBytes);
L2CacheConf.indexLenBits = log_base2(L2CacheConf.numLines);
L2CacheConf.indexMask = 0;
for (i = 0; i < L2CacheConf.indexLenBits; i++)
{
L2CacheConf.indexMask = L2CacheConf.indexMask << 1;
L2CacheConf.indexMask |= 0x00000001;
}
L2CacheConf.indexMask = L2CacheConf.indexMask << subIndexLen;
L2CacheConf.tagLenBits = ADDRESS_LEN_BITS - L2CacheConf.indexLenBits - subIndexLen;
L2CacheConf.tagMask = 0;
for (i = 0; i < L2CacheConf.tagLenBits; i++)
{
L2CacheConf.tagMask = L2CacheConf.tagMask << 1;
L2CacheConf.tagMask |= 0x00000001;
}
L2CacheConf.tagMask = L2CacheConf.tagMask << (L2CacheConf.indexLenBits + subIndexLen);
L2CacheConf.isWriteThrough = 0;
L2CacheConf.hitLatency = 14;
L2CacheConf.missLatency = 14;
}
/**
* Allocates a 2-dimensional array; used to allocate space for cache lines.
*
* @param rows number of sets
* @param cols number of cache lines
* @param size size of the data structure to be stored
*
* @return pointer to array of pointers pointing to rows of data.
*/
void** alloc2D(unsigned int rows, unsigned int cols, size_t size)
{
void** ret;
char *data;
unsigned int i;
size_t arrSize = (rows * sizeof(void*)) + (rows * cols * size);
ret = malloc(arrSize);
memset(ret, 0, arrSize);
data = (char*) (ret + rows);
for(i = 0; i < rows; i++)
{
ret[i] = data + i * cols * size;
}
return ret;
}
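alloc2D packs the row-pointer table and the row data into a single allocation, so one free() releases everything. A minimal usage sketch (illustrative, not in the commit):

/* Sketch (not in the commit): 4 ways of 32 lines each, freed in one call. */
cacheLine_t **lines = (cacheLine_t **) alloc2D(4, 32, sizeof(cacheLine_t));
lines[1][7].tag = 0x1234;   /* way 1, line 7 */
free(lines);                /* releases row pointers and data together */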
static inline unsigned long getTagFromAddress(unsigned long address,
unsigned int tagLengthBits, unsigned long tagMask)
{
return (address & tagMask) >> (ADDRESS_LEN_BITS - tagLengthBits);
}
static inline unsigned long getIndexFromAddress(unsigned long address,
unsigned int offsetLengthBits, unsigned long indexMask)
{
return (address & indexMask) >> offsetLengthBits;
}
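A worked decomposition under the L1 D-cache geometry above (5 offset bits, 5 index bits, 22 tag bits); note that the second argument of getIndexFromAddress must be the line-offset width. This example is not part of the commit:

/* Worked example (not in the commit), L1 D-cache masks from initCacheParams():
 *   getIndexFromAddress(0x80001234, 5, 0x000003E0)
 *     = (0x80001234 & 0x000003E0) >> 5  = 0x11 (line 17)
 *   getTagFromAddress(0x80001234, 22, 0xFFFFFC00)
 *     = (0x80001234 & 0xFFFFFC00) >> 10 = 0x200004
 */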
/**** HWMOD FUNCTIONS *********************************************************/
unsigned long long cortexA5_simICache(unsigned long address,
unsigned int nBytes)
{
unsigned int latency = 0;
unsigned long tag;
unsigned long index;
int setIndex = 0;
tag = getTagFromAddress(address, L1ICacheConf.tagLenBits, L1ICacheConf.tagMask);
index = getIndexFromAddress(address, log_base2(L1ICacheConf.lineLenBytes), L1ICacheConf.indexMask); // shift by the line-offset width, not the index width
for (setIndex = 0; setIndex < L1ICacheConf.numSets; setIndex++)
{
if (L1ICache[setIndex][index].tag == tag &&
IS_CACHELINE_VALID(L1ICache[setIndex][index].flags))
{
latency += L1ICacheConf.hitLatency;
L1I_Hit_Read++;
return latency;
}
}
// L1 Miss has occurred!
L1I_Miss++;
latency += L1ICacheConf.missLatency;
tag = getTagFromAddress(address, L2CacheConf.tagLenBits, L2CacheConf.tagMask);
index = getIndexFromAddress(address, log_base2(L2CacheConf.lineLenBytes), L2CacheConf.indexMask);
for (setIndex = 0; setIndex < L2CacheConf.numSets; setIndex++)
{
if (L2Cache[setIndex][index].tag == tag &&
IS_CACHELINE_VALID(L2Cache[setIndex][index].flags))
{
latency += L2CacheConf.hitLatency;
L2_Hit_Read++;
return latency;
}
}
// L2 Miss has occurred!
L2_Miss++;
latency += L2CacheConf.missLatency;
latency += memReadLatency;
return latency;
}
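As committed, no code path ever marks a cache line valid, so every lookup in both simulators misses; the hit/miss plumbing is in place, but a fill policy is still to come. One possible shape for installing a line after a miss is sketched below. This is an assumption, not part of this commit; the helper name and the round-robin victim choice are illustrative only:

/* Sketch (not in the commit): install a line after a miss, choosing the
 * victim way round-robin. Uses the types and macros defined above. */
static unsigned int nextVictim = 0;
static void fillLine(cacheLine_t **cache, cacheConfig_t *conf,
unsigned long tag, unsigned long index)
{
unsigned int way = nextVictim++ % conf->numSets;
cache[way][index].tag = tag;
SET_CACHELINE_VALID(cache[way][index].flags);
SET_CACHELINE_CLEAN(cache[way][index].flags);
}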
unsigned long long cortexA5_simDCache(unsigned long address,
unsigned int isReadAccess)
{
unsigned int latency = 0;
unsigned long tag;
unsigned long index;
int setIndex = 0;
if (isReadAccess == 0 && L1DCacheConf.isWriteThrough == 1) // Write Access
{
// Simply increment latency by time to write to memory
latency += memWriteLatency;
L1D_Hit_Writethrough++;
}
// For write-back caches we assume writes add no latency; this is safe on
// a single-core system, where no coherence traffic competes for the cache.
tag = getTagFromAddress(address, L1DCacheConf.tagLenBits, L1DCacheConf.tagMask);
index = getIndexFromAddress(address, log_base2(L1DCacheConf.lineLenBytes), L1DCacheConf.indexMask);
for (setIndex = 0; setIndex < L1DCacheConf.numSets; setIndex++)
{
if (L1DCache[setIndex][index].tag == tag &&
IS_CACHELINE_VALID(L1DCache[setIndex][index].flags))
{
latency += L1DCacheConf.hitLatency;
if (isReadAccess)
L1D_Hit_Read++;
else
L1D_Hit_Writeback++;
return latency;
}
}
// L1 Miss has occurred!
L1D_Miss++;
latency += L1DCacheConf.missLatency;
tag = getTagFromAddress(address, L2CacheConf.tagLenBits, L2CacheConf.tagMask);
index = getIndexFromAddress(address, log_base2(L2CacheConf.lineLenBytes), L2CacheConf.indexMask);
for (setIndex = 0; setIndex < L2CacheConf.numSets; setIndex++)
{
if (L2Cache[setIndex][index].tag == tag &&
IS_CACHELINE_VALID(L2Cache[setIndex][index].flags))
{
latency += L2CacheConf.hitLatency;
if (isReadAccess)
L2_Hit_Read++;
else
L2_Hit_Writeback++;
return latency;
}
}
// L2 Miss has occurred!
L2_Miss++;
latency += L2CacheConf.missLatency;
latency += memReadLatency;
return latency;
}
void cortexA5_cacheSimInit()
{
// Allocate space for caches
initCacheParams();
L1DCache = (cacheLine_t **) alloc2D(L1DCacheConf.numSets,
L1DCacheConf.numLines, sizeof(cacheLine_t));
L1ICache = (cacheLine_t **) alloc2D(L1ICacheConf.numSets,
L1ICacheConf.numLines, sizeof(cacheLine_t));
L2Cache = (cacheLine_t **) alloc2D(L2CacheConf.numSets,
L2CacheConf.numLines, sizeof(cacheLine_t));
return;
}
void cortexA5_cacheSimFini()
{
free(L1DCache);
free(L1ICache);
free(L2Cache);
printf("Statistics : \n");
printf("\nL1 Data Cache\n");
printf("\t Hit Read = %ld\n", L1D_Hit_Read);
printf("\t Hit Writeback = %ld\n", L1D_Hit_Writeback);
printf("\t Miss = %ld\n", L1D_Miss);
printf("\nL1 Instruction Cache\n");
printf("\t Hit Read = %ld\n", L1I_Hit_Read);
printf("\t Miss = %ld\n", L1I_Miss);
printf("\nL2 Unified Cache\n");
printf("\t Hit Read = %ld\n", L2_Hit_Read);
printf("\t Hit Writeback = %ld\n", L2_Hit_Writeback);
printf("\t Miss = %ld\n", L2_Miss);
return;
}
struct cacheSimHwMod_t hwMod = {
.simDCache = &cortexA5_simDCache,
.simICache = &cortexA5_simICache,
.cacheSimInit = &cortexA5_cacheSimInit,
.cacheSimFini = &cortexA5_cacheSimFini
};
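The Makefile selects exactly one hardware model via CACHESIM_HWMOD, and the simulator core presumably dispatches through this struct. A hedged usage sketch follows; the field names come from the struct above, while the driver function itself is an assumption:

/* Sketch (not in the commit): driving the model through the hwMod struct. */
extern struct cacheSimHwMod_t hwMod;
void exampleRun(void)
{
unsigned long long cycles = 0;
hwMod.cacheSimInit();
cycles += hwMod.simICache(0x8000, 4); /* fetch 4 bytes at 0x8000 */
cycles += hwMod.simDCache(0x9000, 1); /* data read (isReadAccess = 1) */
printf("cycles = %llu\n", cycles);
hwMod.cacheSimFini();
}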
import logging
class Annotation:
def __init__(self, annotation, fileName, lineNum, replace = False):
self.fileName = fileName
self.lineNum = lineNum
self.annotation = annotation
self.replace = replace
def debug(self):
logging.debug("%s:%d: %s" % (self.fileName, self.lineNum, self.annotation))
def debugDictAnnot(dictAnnot):
for lineNum in dictAnnot.iterkeys():
for annot in dictAnnot[lineNum]:
annot.debug()
def addAnnotationToDict(dictAnnot, lineNum, annot):
if lineNum not in dictAnnot:
logging.debug("adding annotation on line %d" % lineNum)
dictAnnot[lineNum] = [annot]
else:
for a in dictAnnot[lineNum]:
if a.annotation == annot.annotation and a.fileName == annot.fileName:
return
dictAnnot[lineNum].append(annot)
\ No newline at end of file
......@@ -14,7 +14,7 @@ re_mvnInst = re.compile("\s*(?:mvn)s?(?:%s)?\s*(?P<destReg>%s),\s*(?:%s)(?:%s)"
(Cond, Reg, Operand2, EndLine))
ArithOpcode = "(?P<arithOpcode>add|adc|sub|sbc|rsb|rsc|mul|mla)s?" # There are more that I have ignored for now
-re_arithInst = re.compile("\s*(?:%s)(?:%s)?\s*(?P<destReg>%s),\s*(?P<op1Reg>%s),\s*(?:%s)(?:%s)" %
+re_arithInst = re.compile("\s*(?:%s)(?:%s)?\s*(?P<destReg>%s),\s*(?P<op1Reg>%s),\s*(?P<op2>%s)(?:%s)" %
(ArithOpcode, Cond, Reg, Reg, Operand2, EndLine))
ArithLongOpcode = "(?P<arithLongOpcode>umull|umlal|smull|smlal)"
......@@ -25,7 +25,7 @@ LogicOpcode = "(?P<logicOpcode>and|eor|orr|bic)s?"
re_logicInst = re.compile("\s*(?:%s)(?:%s)?\s*(?P<destReg>%s),\s*(?P<op1Reg>%s),\s*(?:%s)(?:%s)" %
(LogicOpcode, Cond, Reg, Reg, Operand2, EndLine))
-re_shiftInst = re.compile("\s*(?:%s)(?:%s)?\s*(?P<destReg>%s),\s*(?P<op1Reg>%s),\s*#(?P<op2ImedVal>\d*)(?:%s)" %
+re_shiftInst = re.compile("\s*(?P<shiftOpcode>%s)(?:%s)?\s*(?P<destReg>%s),\s*(?P<op1Reg>%s),\s*#(?P<op2ImedVal>\d*)(?:%s)" %
(ShiftOpcode, Cond, Reg, Reg, EndLine))
BranchOpcode = "(?P<branchOpcode>b|bl|bx|blx|bxj)"
......@@ -41,8 +41,8 @@ AMode2_2 = "\[(?P<am2_2BaseReg>%s),\s*#(?P<am2_2ImedOff>-?\d*)\]" % (Reg)
AMode2_3 = "\[(?P<am2_3BaseReg>%s),\s*(?P<am2_3OffsetReg>-?%s)\]" % (Reg, Reg)
AMode2_4 = "\[(?P<am2_4BaseReg>%s),\s*(?P<am2_4OffsetReg>%s),\s*(?:%s)\s*#\d*\]" % (Reg, Reg, ShiftOpcode)
AMode2_5 = "\[(?P<am2_5BaseReg>%s)\],\s*#(?P<am2_5ImedOff>-?\d*)" % (Reg)
-AMode2_6 = "\[(?P<am2_6BaseReg>%s)\],\s*-?(?:%s)" % (Reg, Reg)
-AMode2_7 = "\[(?P<am2_7BaseReg>%s)\],\s*(?:%s),\s*(?:%s)\s*#\d*" % (Reg, Reg, ShiftOpcode)
+AMode2_6 = "\[(?P<am2_6BaseReg>%s)\],\s*-?(?P<am2_6OffsetReg>%s)" % (Reg, Reg)
+AMode2_7 = "\[(?P<am2_7BaseReg>%s)\],\s*(?P<am2_7OffsetReg>%s),\s*(?:%s)\s*#\d*" % (Reg, Reg, ShiftOpcode)
AMode2 = ("(?:%s)|(?:%s)|(?:%s)|(?:%s)|(?:%s)|(?:%s)|(?:%s)" % (AMode2_1,
AMode2_2,
......@@ -59,7 +59,7 @@ re_loadInst = re.compile("\s*ldrs?(?:%s)?(?:%s)?\s*(?P<destReg>%s),\s*(?:%s)(?:%
re_storeInst = re.compile("\s*strs?(?:%s)?(?:%s)?\s*(?P<destReg>%s),\s*(?:%s)(?:%s)" %
(LoadStoreType, Cond, Reg, AMode2, EndLine))
-re_cmpInst = re.compile("\s*(?:cmp|cmn)\s*(?:%s),\s*(?:%s)(?:%s)" %
+re_cmpInst = re.compile("\s*(?:cmp|cmn)\s*(?P<op1Reg>%s),\s*(?:%s)(?:%s)" %
(Reg, Operand2, EndLine))
re_pushInst = re.compile("\s*push\s*\{(?P<pushRegs>(?:%s)(?:,\s*(?:%s))*)\}(?:%s)" %
......
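To make the new named groups concrete, here is a self-contained sketch of how the updated re_arithInst exposes its operands. The Reg, Cond, Operand2 and EndLine fragments below are simplified stand-ins (the real definitions live elsewhere in arm_isa_regex.py, and the real Operand2 also carries op2Reg/op2RegShifted/op2ImedVal groups), so this only illustrates the group names added by this commit:

import re

# Simplified stand-ins (assumptions) for the real regex fragments:
Reg = r"(?:r\d+|sp|lr|pc)"
Cond = r"(?:eq|ne|lt|gt|ge|le)"
Operand2 = r"(?:%s|#\d+)" % Reg
EndLine = r"\s*$"
ArithOpcode = r"(?P<arithOpcode>add|adc|sub|sbc|rsb|rsc|mul|mla)s?"
re_arithInst = re.compile(r"\s*(?:%s)(?:%s)?\s*(?P<destReg>%s),\s*(?P<op1Reg>%s),\s*(?P<op2>%s)(?:%s)" %
                          (ArithOpcode, Cond, Reg, Reg, Operand2, EndLine))

m = re_arithInst.match("    adds r0, r1, r2")
print("%s %s %s %s" % (m.group("arithOpcode"), m.group("destReg"),
                       m.group("op1Reg"), m.group("op2")))
# -> add r0 r1 r2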
......@@ -11,6 +11,7 @@
#include "cacheSim.h"
extern unsigned long SP;
extern unsigned long long memAccessCycles;
extern unsigned long long pipelineCycles;
/***********************************************************
Copyright 1992 by Stichting Mathematisch Centrum, Amsterdam, The
......@@ -120,6 +121,7 @@ memAccessCycles += simDCache(0x4a8, 1); // PC Relative Load
memAccessCycles += simICache(0x36c, 44);
// TODO: UnmappedLS: Load GlobalVar coder_1_state at line 247
// TODO: UnmappedLS: Load GlobalVar coder_1_state at line 249
pipelineCycles += 23;
valpred = state->valprev;
memAccessCycles += simDCache(state_addr, 1);
index = state->index;
......@@ -138,6 +140,7 @@ memAccessCycles += simDCache(0x4a8, 1); // PC Relative Load
memAccessCycles += simDCache((SP + outp_addr), 0);
// Simulating I Cache for obj block 1
memAccessCycles += simICache(0x398, 32);
pipelineCycles += 15;
outp = outdata;
memAccessCycles += simDCache(outdata_addr, 1);
ivtmp_28 = 0;
......@@ -147,6 +150,7 @@ memAccessCycles += simICache(0x398, 32);
adpcm_coderbb_4:
// # PRED: 18 [91.0%] (true,exec) 3 [100.0%] (fallthru,exec)
memAccessCycles += simDCache((SP + 0x4), 1); // Reading Spilled Register
pipelineCycles += 48;
diff = (int) *(short int *)((uintptr_t)indata + (uintptr_t)ivtmp_28) - valpred;
memAccessCycles += simDCache(indata_addr + (sizeof(short ) * (+ivtmp_28)), 1);
if (diff < 0)
......@@ -289,6 +293,7 @@ memAccessCycles += simICache(0x3b8, 200);
adpcm_coderbb_19:
// # PRED: 18 [9.0%] (false,exec)
pipelineCycles += 10;
if (bufferstep == 0)
goto adpcm_coderbb_20;
else
......@@ -312,6 +317,7 @@ memAccessCycles += simDCache((SP + 0xc), 1); // Reading Spilt Register
memAccessCycles += simICache(0x490, 24);
// TODO: UnmappedLS: Store GlobalVar coder_1_state at line 317
// TODO: UnmappedLS: Store GlobalVar coder_1_state at line 318
pipelineCycles += 19;
state->valprev = (short int) (short int) valpred;
memAccessCycles += simDCache(state_addr, 0);
state->index = (char) (char) index;
......
......@@ -11,6 +11,7 @@
#include "cacheSim.h"
unsigned long SP = 0x1234;
unsigned long long memAccessCycles = 0;
unsigned long long pipelineCycles = 0;
/*
** Timing - Test timing on adpcm coder and decoder.
......@@ -69,6 +70,7 @@ memAccessCycles += simDCache(ARR_SIZE_addr, 1);
memAccessCycles += simDCache((SP + ARR_SIZE_0_addr), 0);
// Simulating I Cache for obj block 0
memAccessCycles += simICache(0x200, 36);
pipelineCycles += 27;
ARR_SIZE_0 = ARR_SIZE;
j = ARR_SIZE_0 / 10240;
if (j != 0)
......@@ -86,12 +88,14 @@ memAccessCycles += simDCache(0x364, 1); // PC Relative Load
memAccessCycles += simDCache(0x368, 1); // PC Relative Load
// Simulating I Cache for obj block 1
memAccessCycles += simICache(0x224, 40);
pipelineCycles += 21;
end_43 = 0;
count = 0;
// # SUCC: 3 [100.0%] (fallthru)
mainbb_3:
// # PRED: 13 [100.0%] (fallthru) 14 [100.0%] (fallthru)
pipelineCycles += 9;
end_46 = end_43 + 10240;
if (end_43 < end_46)
goto mainbb_4;
......@@ -104,6 +108,7 @@ mainbb_4:
memAccessCycles += simDCache((SP + 0x4), 1); // Reading Spilled Register
// Simulating I Cache for obj block 3
memAccessCycles += simICache(0x258, 20);
pipelineCycles += 13;
i_45 = (int) end_43;
ivtmp_34 = (uintptr_t)&in_Data[i_45];
end_44 = end_43;
......@@ -115,6 +120,7 @@ memAccessCycles += simDCache(pcmdata_addr + (2 * (end_44-end_43)), 0);
// Simulating I Cache for obj block 4
memAccessCycles += simICache(0x26c, 36);
// TODO: UnmappedLS: Load GlobalVar in_Data at line 179
pipelineCycles += 16;
pcmdata[end_44 - end_43] = *(short int*)((uintptr_t)ivtmp_34);
i_45 = i_45 + 1;
end_44 = (long unsigned int) i_45;
......@@ -129,6 +135,7 @@ mainbb_6:
// # PRED: 5 [1.0%] (false,exec) 3 [1.0%] (false,exec)
// Simulating I Cache for obj block 5
memAccessCycles += simICache(0x290, 40);
pipelineCycles += 14;
adpcm_coder (&pcmdata, pcmdata_addr, &adpcmdata, adpcmdata_addr, 10240, &coder_1_state, coder_1_state_addr);
count = count + 1;
if (j > count)
......@@ -151,6 +158,7 @@ memAccessCycles += simDCache(0x358, 1); // PC Relative Load
memAccessCycles += simDCache((SP + ARR_SIZE_0_addr), 1);
// Simulating I Cache for obj block 6
memAccessCycles += simICache(0x2b8, 32);
pipelineCycles += 19;
if (ARR_SIZE_0 % 10240 != 0)
goto mainbb_8;
else
......@@ -162,6 +170,7 @@ mainbb_8:
memAccessCycles += simDCache(0x354, 1); // PC Relative Load
// Simulating I Cache for obj block 7
memAccessCycles += simICache(0x2d8, 24);
pipelineCycles += 14;
start_40 = j * 10240;
memAccessCycles += simDCache(ARR_SIZE_addr, 1);
end = ARR_SIZE;
......@@ -177,6 +186,7 @@ memAccessCycles += simDCache(0x35c, 1); // PC Relative Load
memAccessCycles += simDCache(0x360, 1); // PC Relative Load
// Simulating I Cache for obj block 8
memAccessCycles += simICache(0x2f0, 28);
pipelineCycles += 13;
i = (int) start_40;
ivtmp_28 = (uintptr_t)&in_Data[i];
D_2229 = (int) end;
......@@ -189,6 +199,7 @@ memAccessCycles += simDCache(pcmdata_addr + (2 * (start-start_40)), 0);
// Simulating I Cache for obj block 9
memAccessCycles += simICache(0x30c, 36);
// TODO: UnmappedLS: Inaccurately Matched Load at line 219
pipelineCycles += 16;
pcmdata[start - start_40] = *(short int*)((uintptr_t)ivtmp_28);
i = i + 1;
start = (long unsigned int) i;
......@@ -206,6 +217,7 @@ memAccessCycles += simDCache(0x364, 1); // PC Relative Load
memAccessCycles += simDCache(0x368, 1); // PC Relative Load
// Simulating I Cache for obj block 10
memAccessCycles += simICache(0x330, 20);
pipelineCycles += 11;
adpcm_coder (&pcmdata, pcmdata_addr, &adpcmdata, adpcmdata_addr, (int) (end - start_40), &coder_1_state, coder_1_state_addr);
// # SUCC: 12 [100.0%] (fallthru,exec)
......@@ -214,7 +226,9 @@ mainbb_12:
// Simulating I Cache for obj block 11
memAccessCycles += simICache(0x344, 16);
printf("memAccessCycles = \%llu\n", memAccessCycles);
printf("pipelineCycles = \%llu\n", pipelineCycles);
cacheSimFini();
pipelineCycles += 18;
return 0;
// # SUCC: EXIT [100.0%]
......
......@@ -2,13 +2,15 @@ import logging
from optparse import OptionParser
from subprocess import call
import linecache as lc
+from collections import OrderedDict
from load_store_info import *
from match_cfg import match_cfg
from gdb_info import *
from cGrammar import parse_statement
from irc_regex import *
-from collections import OrderedDict
+from pipeline_sim import *
+from annotation import *
import re
......@@ -21,16 +23,6 @@ def find(f, seq):
return item
return None
-class Annotation:
-def __init__(self, annotation, fileName, lineNum, replace = False):
-self.fileName = fileName
-self.lineNum = lineNum
-self.annotation = annotation
-self.replace = replace
-def debug(self):
-logging.debug("%s:%d: %s" % (self.fileName, self.lineNum, self.annotation))
def getListLocalVarInFunc(listLocalVariables, functionName):
listLocalVarInFunc = []
for localVar in listLocalVariables:
......@@ -38,19 +30,8 @@ def getListLocalVarInFunc(listLocalVariables, functionName):
listLocalVarInFunc.append(localVar)
return listLocalVarInFunc
-def debugDictAnnot(dictAnnot):
-for lineNum in dictAnnot.iterkeys():
-for annot in dictAnnot[lineNum]:
-annot.debug()
-def addAnnotationToDict(dict, lineNum, annot):
-if lineNum not in dict:
-dict[lineNum] = [annot]
-else:
-for a in dict[lineNum]:
-if a.annotation == annot.annotation and a.fileName == annot.fileName:
-return
-dict[lineNum].append(annot)
# TODO : Make a new function to instrument the additional global vars needed!
# def annotateGlobalVar(listISCFileNames):
def annotateVarFuncDecl(listISCFileNames, listISCFunctions, listGlobalVariables, listLocalVariables):
dictAnnotVarFuncDecl = {}
......@@ -91,6 +72,11 @@ def annotateVarFuncDecl(listISCFileNames, listISCFunctions, listGlobalVariables,
addAnnotationToDict(dictAnnotVarFuncDecl,
lineNum,
annot)
annot_str = "unsigned long long pipelineCycles = 0;"
annot = Annotation(annot_str, ISCFileName, lineNum, False)
addAnnotationToDict(dictAnnotVarFuncDecl,
lineNum,
annot)
else:
annot_str = "extern unsigned long SP;"
annot = Annotation(annot_str, ISCFileName, lineNum, False)
......@@ -102,6 +88,11 @@ def annotateVarFuncDecl(listISCFileNames, listISCFunctions, listGlobalVariables,
addAnnotationToDict(dictAnnotVarFuncDecl,
lineNum,
annot)
annot_str = "extern unsigned long long pipelineCycles;"
annot = Annotation(annot_str, ISCFileName, lineNum, False)
addAnnotationToDict(dictAnnotVarFuncDecl,
lineNum,
annot)
if inMultiLineVarInit == 1:
m = re_VarDeclInitMultiLineEnd.match(line)
......@@ -303,6 +294,8 @@ def annotateVarFuncDecl(listISCFileNames, listISCFunctions, listGlobalVariables,
debugDictAnnot(dictAnnotVarFuncDecl)
return dictAnnotVarFuncDecl
# TODO : Annotate Push Pop Operations for DCache Access to Stack!
def annotateLoadStore(listISCFunctions, listObjdumpFunctions, listLSInfo, listGlobalVariables, listLocalVariables):
dictAnnotLoadStore = {}
......@@ -445,6 +438,9 @@ def annotateLoadStore(listISCFunctions, listObjdumpFunctions, listLSInfo, listGl
annot_str = 'printf("memAccessCycles = \%llu\\n", memAccessCycles);'
annot = Annotation(annot_str, funcISC.fileName, returnLineNumber-1, False)
addAnnotationToDict(dictAnnotLoadStore, returnLineNumber-1, annot)
annot_str = 'printf("pipelineCycles = \%llu\\n", pipelineCycles);'
annot = Annotation(annot_str, funcISC.fileName, returnLineNumber-1, False)
addAnnotationToDict(dictAnnotLoadStore, returnLineNumber-1, annot)
annot_str = 'cacheSimFini();'
annot = Annotation(annot_str, funcISC.fileName, returnLineNumber-1, False)
addAnnotationToDict(dictAnnotLoadStore, returnLineNumber-1, annot)
......@@ -553,7 +549,11 @@ def instrumentCache(listISCFileNames, listObjdumpFileNames, listBinaryFileNames,
dictAnnotLoadStore = annotateLoadStore(listISCFunctions, listObjdumpFunctions, listLSInfo, listGlobalVariables, listLocalVariables)
dictAnnotPipeline = annot_pipeline_sim(listISCFunctions, listObjdumpFunctions)
debugDictAnnot(dictAnnotPipeline)
dictAnnot = unionDict(dictAnnotVarFuncDecl, dictAnnotLoadStore)
dictAnnot = unionDict(dictAnnot, dictAnnotPipeline)
generateAnnotatedSourceFiles(dictAnnot, listISCFileNames, insOutputPath)
......
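unionDict is not shown in this diff. Given the {lineNum: [Annotation, ...]} dictionaries built by addAnnotationToDict, its behaviour is presumably a list-concatenating merge; a sketch under that assumption (ordering concerns ignored):

def unionDict(dictA, dictB):
    # Sketch (assumption): merge two {lineNum: [Annotation, ...]} dicts,
    # concatenating annotation lists when a line number occurs in both.
    merged = {}
    for d in (dictA, dictB):
        for lineNum, annots in d.items():
            merged.setdefault(lineNum, []).extend(annots)
    return merged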
import linecache as lc
from arm_isa_regex import *
from annotation import *
ALU_LAT = 1
MUL_LAT = 1
LDST_LAT = 1
# Residual result latencies: result-latency cycles beyond the issue cycle
ALU_RES_LAT = 1 - ALU_LAT
MUL_RES_LAT = 4 - MUL_LAT
LDST_RES_LAT = 3 - LDST_LAT
def find(f, seq):
"""Return the first item in seq where f(item) is True, else None."""
for item in seq:
if f(item):
return item
return None
def annot_pipeline_sim(listISCFunctions,
listObjdumpFunctions):
'''
Simulate the pipeline for each basic block in the objdump.
We assume that:
* each basic block is independent and cold-started, i.e. no instruction
is midway through the pipeline when the block begins;
* every load/store instruction hits in the L1 data cache.
Pipeline structure:
* 8-stage pipeline
* 2 instruction fetch stages
* 2 instruction decode stages
* 4 parallel execute stages:
* arithmetic operations: SH, ALU, SAT, WB
* multiply operations: MAC1, MAC2, MAC3
* load/store unit: ADD, DC1, DC2, WB
Definitions:
* Result Latency: number of cycles before the result of this instruction
is available at the start of the ALU, MAC2 or DC1 stage of the next
instruction.
* Early Reg: register required at the start of the SH, MAC1 or ADD stage.
One cycle must be added to the result latency of the instruction
producing this register for interlock calculations.
* Late Reg: register required in the second stage of the execute
pipeline. One cycle must be subtracted from the result latency of the
instruction producing this register for interlock calculations.
Load/Store instructions:
* Result Latency: 3 cycles
ALU (ADD/MOV) instructions:
* Result Latency: 1 cycle
MUL instructions:
* Result Latency: 4 cycles on average (varies)
'''
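# Worked example (not in the commit): with the constants above,
# LDST_RES_LAT = 2, so for the two-instruction block
#     ldr r1, [r0]        @ load, result latency 3 (residual 2)
#     add r2, r3, r1      @ r1 is a normal (early) operand
# the model charges 7 (pipeline fill) + 1 (ldr) + 1 (add)
# + 2 + 1 (early-reg interlock: prevResLat + 1) = 12 cycles.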
dictAnnotPipeline = {}
for funcObj in listObjdumpFunctions:
funcISC = find(lambda fn: fn.functionName == funcObj.functionName,
listISCFunctions)
for blockObj in funcObj.cfg.listBlocks:
# Initialize per-block state
prevDestReg = None
prevResLat = 0
currBlockCycles = 7 # cycles to fill the pipeline on a cold start
for lineNumObj in range(blockObj.startLine, blockObj.endLine + 1):
lineObj = lc.getline(funcObj.fileName, lineNumObj)
# Initialize some state Registers
opcode = ""
destReg = ""
op1Reg = ""
op2 = ""
op2RegIsShifted = False
m = re_instruction.match(lineObj)
assert(m is not None)
instObj = m.group("instruction")
m = re_arithInst.match(instObj)
if m is not None:
opcode = m.group("arithOpcode")
destReg = m.group("destReg")
op1Reg = m.group("op1Reg")
if m.group("op2RegShifted") is not None:
op2 = m.group("op2RegShifted")
op2RegIsShifted = True
elif m.group("op2Reg") is not None:
op2 = m.group("op2Reg")
op2RegIsShifted = False
else:
assert(m.group("op2ImedVal") is not None)
op2 = ""
op2RegIsShifted = False
if opcode not in ["mul", "mla"]:
# ALU instruction
currBlockCycles = currBlockCycles + ALU_LAT
currResLat = ALU_RES_LAT
else:
# Multiply instruction
currBlockCycles = currBlockCycles + MUL_LAT
currResLat = MUL_RES_LAT
# Calculate interlock for arithmetic and multiply instructions.
# prevResLat must still hold the previous instruction's result latency
# here, so the current one is kept in currResLat until after the check.
if prevDestReg is not None:
if op2 != "":
if op2RegIsShifted:
if op2 == prevDestReg:
# Early Reg!
currBlockCycles = currBlockCycles + prevResLat + 1
elif op1Reg == prevDestReg:
# Late Reg!
currBlockCycles = currBlockCycles + prevResLat - 1
else:
# No interlock
pass
else: # op2 register is not shifted
if (op1Reg == prevDestReg or
op2 == prevDestReg):
# Early Reg!
currBlockCycles = currBlockCycles + prevResLat + 1
else:
# No interlock
pass
prevDestReg = destReg
prevResLat = currResLat
continue
m = re_movInst.match(instObj)
if m is not None:
destReg = m.group("destReg")
if m.group("op2RegShifted") is not None:
op2 = m.group("op2RegShifted")
op2RegIsShifted = True
elif m.group("op2Reg") is not None:
op2 = m.group("op2Reg")
op2RegIsShifted = False
else:
assert(m.group("op2ImedVal") is not None)
op2 = ""
op2RegIsShifted = False
currBlockCycles = currBlockCycles + ALU_LAT
# Calculate interlock for the mov instruction (op2 is its only register operand)
if prevDestReg is not None:
if op2 != "":
if op2 == prevDestReg:
# Early Reg!
currBlockCycles = currBlockCycles + prevResLat + 1
else:
# No interlock
pass
prevDestReg = destReg
prevResLat = ALU_RES_LAT
continue
m = re_mvnInst.match(instObj)
if m is not None:
destReg = m.group("destReg")
if m.group("op2RegShifted") is not None:
op2 = m.group("op2RegShifted")
op2RegIsShifted = True
elif m.group("op2Reg") is not None:
op2 = m.group("op2Reg")
op2RegIsShifted = False
else:
assert(m.group("op2ImedVal") is not None)
op2 = ""
op2RegIsShifted = False
currBlockCycles = currBlockCycles + ALU_LAT
# Calculate interlock for the mvn instruction (op2 is its only register operand)
if prevDestReg is not None:
if op2 != "":
if op2 == prevDestReg:
# Early Reg!
currBlockCycles = currBlockCycles + prevResLat + 1
else:
# No interlock
pass
prevDestReg = destReg
prevResLat = ALU_RES_LAT
continue
m = re_arithLongInst.match(instObj)
if m is not None:
# Long Arithmetic Instructions
currBlockCycles = currBlockCycles + 2 * MUL_RES_LAT
prevDestReg = None
prevResLat = 0
# TODO: This needs to be improved!
continue
m = re_logicInst.match(instObj)
if m is not None:
# Logical Instruction
# opcode = m.group("logicOpcode")
destReg = m.group("destReg")
op1Reg = m.group("op1Reg")
if m.group("op2RegShifted") is not None:
op2 = m.group("op2RegShifted")
op2RegIsShifted = True
elif m.group("op2Reg") is not None:
op2 = m.group("op2Reg")
op2RegIsShifted = False
else:
assert(m.group("op2ImedVal") is not None)
op2 = ""
op2RegIsShifted = False
currBlockCycles = currBlockCycles + ALU_LAT
# Calculating Interlock
if prevDestReg is not None:
if op2 != "":
if (op2RegIsShifted == True):
if (op2 == prevDestReg):
# Early Reg!
currBlockCycles = currBlockCycles + prevResLat + 1
elif (op1Reg == prevDestReg):
# Late Reg!
currBlockCycles = currBlockCycles + prevResLat - 1
else:
# No interlock
pass
else: # (op2RegIsShifted == False)
if (op1Reg == prevDestReg or
op2 == prevDestReg):
# Early Reg!
currBlockCycles = currBlockCycles + prevResLat + 1
else:
# No interlock
pass
prevDestReg = destReg
prevResLat = ALU_RES_LAT
continue
m = re_shiftInst.match(instObj)
if m is not None:
# Shift Instruction
# opcode = m.group("shiftOpcode")
destReg = m.group("destReg")
op1Reg = m.group("op1Reg")
currBlockCycles = currBlockCycles + ALU_LAT
# Calculation Interlock
if prevDestReg is not None:
if prevDestReg == op1Reg:
# Early Reg!
currBlockCycles = currBlockCycles + prevResLat + 1
else:
# No Interlock!
pass
prevDestReg = destReg
prevResLat = ALU_RES_LAT
continue
m = re_branchInst.match(instObj)
if m is not None:
# Branch Instruction
currBlockCycles = currBlockCycles + ALU_LAT
prevDestReg = None
prevResLat = 0
# TODO: May need to be improved!
continue
m = re_cmpInst.match(instObj)
if m is not None:
# Compare Instruction
op1Reg = m.group("op1Reg")
if m.group("op2RegShifted") is not None:
op2 = m.group("op2RegShifted")
op2RegIsShifted = True
elif m.group("op2Reg") is not None:
op2 = m.group("op2Reg")
op2RegIsShifted = False
else:
assert(m.group("op2ImedVal") is not None)
op2 = ""
op2RegIsShifted = False
currBlockCycles = currBlockCycles + ALU_LAT
# Calculating Interlock
if prevDestReg is not None:
if op2 != "":
if (op2RegIsShifted == True):
if (op2 == prevDestReg):
# Early Reg!
currBlockCycles = currBlockCycles + prevResLat + 1
elif (op1Reg == prevDestReg):
# Late Reg!
currBlockCycles = currBlockCycles + prevResLat - 1
else:
# No interlock
pass
else: # (op2RegIsShifted == False)
if (op1Reg == prevDestReg or
op2 == prevDestReg):
# Early Reg!
currBlockCycles = currBlockCycles + prevResLat + 1
else:
# No interlock
pass
prevDestReg = None
prevResLat = 0
continue
m = re_pushInst.match(instObj)
if m is not None:
pushRegs = m.group("pushRegs")
listPushRegs = pushRegs.split(",")
currBlockCycles = currBlockCycles + len(listPushRegs)
prevDestReg = None
prevResLat = 0
# TODO: May need to be fixed!
continue
m = re_popInst.match(instObj)
if m is not None:
pushRegs = m.group("popRegs")
listPushRegs = pushRegs.split(",")
currBlockCycles = currBlockCycles + len(listPushRegs)
prevDestReg = None
prevResLat = 0
# TODO: May need to be fixed!
continue
m = re_ignoredInst.match(instObj)
if m is not None:
currBlockCycles = currBlockCycles + LDST_LAT
prevDestReg = None
prevResLat = 0
# TODO: Has to be improved!!!
continue
m = re_loadInst.match(instObj)
if m is not None:
destReg = m.group("destReg")
for baseRegLabel in ["am2_1BaseReg",
"am2_2BaseReg",
"am2_3BaseReg",
"am2_4BaseReg",
"am2_5BaseReg",
"am2_6BaseReg",
"am2_7BaseReg"]:
if m.group(baseRegLabel) is not None:
break
op1Reg = m.group(baseRegLabel)
op2 = ""
if baseRegLabel == "am2_3BaseReg":
op2 = m.group("am2_3OffsetReg")
op2RegIsShifted = False
elif baseRegLabel == "am2_4BaseReg":
op2 = m.group("am2_4OffsetReg")
op2RegIsShifted = True
elif baseRegLabel == "am2_6BaseReg":
op2 = m.group("am2_6OffsetReg")
op2RegIsShifted = False
elif baseRegLabel == "am2_7BaseReg":
op2 = m.group("am2_7OffsetReg")
op2RegIsShifted = True
currBlockCycles = currBlockCycles + LDST_LAT
if prevDestReg is not None:
if op2 != "":
if (op2RegIsShifted == True):
if (op2 == prevDestReg):
# Early Reg!
currBlockCycles = currBlockCycles + prevResLat + 1
elif (op1Reg == prevDestReg):
# Late Reg!
currBlockCycles = currBlockCycles + prevResLat - 1
else:
# No interlock
pass
else: # (op2RegIsShifted == False)
if (op1Reg == prevDestReg or
op2 == prevDestReg):
# Early Reg!
currBlockCycles = currBlockCycles + prevResLat + 1
else:
# No interlock
pass
prevDestReg = destReg
prevResLat = LDST_RES_LAT
continue
m = re_storeInst.match(instObj)
if m is not None:
currBlockCycles = currBlockCycles + LDST_LAT
prevDestReg = None
prevResLat = 0
continue
print "%d : Instruction Could not be identified!" % (lineNumObj)
# Block Done!
blockIndISC = blockObj.mapsTo[0]
blockISC = funcISC.cfg.listBlocks[blockIndISC]
annot_str = "pipelineCycles += %d;" % (currBlockCycles)
annot = Annotation(annot_str,
funcISC.fileName,
blockISC.startLine - 1,
replace = False)
print ("Adding annotation to %s:%d : %s" % (funcISC.fileName,
blockISC.startLine-1,
annot_str))
addAnnotationToDict(dictAnnotPipeline,
blockISC.startLine-1,
annot)
# Function Done!
# All Functions Done!
return dictAnnotPipeline
if __name__ == "__main__":
pass
\ No newline at end of file