2018
R S Kannan, A Jain, M A Laurenzano, L Tang, J Mars: Proctor: Detecting and Investigating Interference in Shared Datacenters (Inproceedings). 2018 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS), pp. 76-86, 2018.
@inproceedings{8366937,
title = {Proctor: Detecting and Investigating Interference in Shared Datacenters},
author = {R S Kannan and A Jain and M A Laurenzano and L Tang and J Mars},
doi = {10.1109/ISPASS.2018.00016},
year = {2018},
date = {2018-04-01},
booktitle = {2018 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS)},
pages = {76-86},
abstract = {Cloud-scale datacenter management systems utilize virtualization to provide performance isolation while maximizing the utilization of the underlying hardware infrastructure. However, virtualization does not provide complete performance isolation as Virtual Machines (VMs) still compete for nonreservable shared resources (like caches, network, I/O bandwidth etc.) This becomes highly challenging to address in datacenter environments housing tens of thousands of VMs, causing degradation in application performance. Addressing this problem for production datacenters requires a non-intrusive scalable solution that 1) detects performance intrusion and 2) investigates both the intrusive VMs causing interference, as well as the resource(s) for which the VMs are competing. To address this problem, this paper introduces Proctor, a real time, lightweight and scalable analytics fabric that detects performance intrusive VMs and identifies its root causes from among the arbitrary VMs running in shared datacenters across 4 key hardware resources - network, I/O, cache, and CPU. Proctor is based on a robust statistical approach that requires no special profiling phases, standing in stark contrast to a wide body of prior work that assumes pre-acquisition of application level information prior to its execution. By detecting performance degradation and identifying the root cause VMs and their metrics, Proctor can be utilized to dramatically improve the performance outcomes of applications executing in large-scale datacenters. From our experiments, we are able to show that when we deploy Proctor in a datacenter housing a mix of I/O, network, compute and cache-sensitive applications, it is able to effectively pinpoint performance intrusive VMs. Further, we observe that when Proctor is applied with migration, the application-level Quality-of-Service improves by an average of 2.2× as compared to systems which are unable to detect, identify and pinpoint performance intrusion and their root causes.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
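The Proctor abstract above describes a two-step, profile-free statistical pipeline: detect that a VM's performance has degraded, then pinpoint the co-located VM and the resource metric responsible. The Python sketch below is an illustration only, not Proctor's actual algorithm; the rolling-median detector, the MAD threshold, and the correlation-based ranking (plus all window sizes) are assumptions chosen to make the two steps concrete.

```python
# Illustrative sketch only -- NOT Proctor's algorithm. Step 1 flags a VM whose
# latest latency sample deviates from its own recent history; step 2 ranks the
# resource metrics of co-located VMs by how strongly they track the slowdown.
import statistics

def pearson(xs, ys):
    n = len(xs)
    mx, my = sum(xs) / n, sum(ys) / n
    cov = sum((x - mx) * (y - my) for x, y in zip(xs, ys))
    vx = sum((x - mx) ** 2 for x in xs) ** 0.5
    vy = sum((y - my) ** 2 for y in ys) ** 0.5
    return cov / (vx * vy) if vx and vy else 0.0

def detect_degradation(latency_history, window=30, k=3.0):
    """Flag the newest sample if it exceeds median + k * MAD of the window."""
    recent = latency_history[-window:]
    med = statistics.median(recent)
    mad = statistics.median(abs(x - med) for x in recent) or 1e-9
    return latency_history[-1] > med + k * mad

def rank_suspects(victim_latency, neighbor_metrics):
    """neighbor_metrics: {(vm, resource): samples aligned with victim_latency}.
    Returns (correlation, (vm, resource)) pairs, strongest suspect first."""
    scored = [(pearson(victim_latency, series), key)
              for key, series in neighbor_metrics.items()]
    return sorted(scored, reverse=True)
```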
Chang-Hong Hsu, Qingyuan Deng, Jason Mars, Lingjia Tang: SmoothOperator: Reducing Power Fragmentation and Improving Power Utilization in Large-scale Datacenters (Inproceedings). Proceedings of the Twenty-Third International Conference on Architectural Support for Programming Languages and Operating Systems, pp. 535–548, ACM, Williamsburg, VA, USA, 2018, ISBN: 978-1-4503-4911-6.
@inproceedings{Hsu:2018:SRP:3173162.3173190,
title = {SmoothOperator: Reducing Power Fragmentation and Improving Power Utilization in Large-scale Datacenters},
author = {Chang-Hong Hsu and Qingyuan Deng and Jason Mars and Lingjia Tang},
url = {http://doi.acm.org/10.1145/3173162.3173190},
doi = {10.1145/3173162.3173190},
isbn = {978-1-4503-4911-6},
year = {2018},
date = {2018-01-01},
booktitle = {Proceedings of the Twenty-Third International Conference on Architectural Support for Programming Languages and Operating Systems},
pages = {535--548},
publisher = {ACM},
address = {Williamsburg, VA, USA},
series = {ASPLOS '18},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Shih-Chieh Lin, Yunqi Zhang, Chang-Hong Hsu, Matt Skach, Md E Haque, Lingjia Tang, Jason Mars: The Architectural Implications of Autonomous Driving: Constraints and Acceleration (Inproceedings). Proceedings of the Twenty-Third International Conference on Architectural Support for Programming Languages and Operating Systems, pp. 751–766, ACM, Williamsburg, VA, USA, 2018, ISBN: 978-1-4503-4911-6.
@inproceedings{Lin:2018:AIA:3173162.3173191,
title = {The Architectural Implications of Autonomous Driving: Constraints and Acceleration},
author = {Shih-Chieh Lin and Yunqi Zhang and Chang-Hong Hsu and Matt Skach and Md E Haque and Lingjia Tang and Jason Mars},
url = {http://doi.acm.org/10.1145/3173162.3173191},
doi = {10.1145/3173162.3173191},
isbn = {978-1-4503-4911-6},
year = {2018},
date = {2018-01-01},
booktitle = {Proceedings of the Twenty-Third International Conference on Architectural Support for Programming Languages and Operating Systems},
pages = {751--766},
publisher = {ACM},
address = {Williamsburg, VA, USA},
series = {ASPLOS '18},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Matt Skach, Manish Arora, Dean Tullsen, Lingjia Tang, Jason Mars: Virtual Melting Temperature: Managing Server Load to Minimize Cooling Overhead with Phase Change Materials (Inproceedings). Proceedings of the 45th Annual International Symposium on Computer Architecture, ACM, 2018.
@inproceedings{Skach:vmt,
title = {Virtual Melting Temperature: Managing Server Load to Minimize Cooling Overhead with Phase Change Materials},
author = {Matt Skach and Manish Arora and Dean Tullsen and Lingjia Tang and Jason Mars},
year = {2018},
date = {2018-01-01},
booktitle = {Proceedings of the 45th Annual International Symposium on Computer Architecture},
publisher = {ACM},
series = {ISCA '18},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Animesh Jain, Amar Phanishayee, Jason Mars, Lingjia Tang, Gennady Pekhimenko: Gist: Efficient Data Encoding for Deep Neural Network Training (Inproceedings). Proceedings of the 45th Annual International Symposium on Computer Architecture, ACM, 2018.
@inproceedings{Jian:gist,
title = {Gist: Efficient Data Encoding for Deep Neural Network Training},
author = {Animesh Jain and Amar Phanishayee and Jason Mars and Lingjia Tang and Gennady Pekhimenko},
year = {2018},
date = {2018-01-01},
booktitle = {Proceedings of the 45th Annual International Symposium on Computer Architecture},
publisher = {ACM},
series = {ISCA '18},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Yiping Kang, Yunqi Zhang, Jonathan K Kummerfeld, Lingjia Tang, Jason Mars: Data Collection for a Production Dialogue System: A Clinc Perspective (Inproceedings). Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 3 (Industry Papers), pp. 33–40, 2018.
@inproceedings{kang2018data,
title = {Data Collection for a Production Dialogue System: A Clinc Perspective},
author = {Yiping Kang and Yunqi Zhang and Jonathan K Kummerfeld and Lingjia Tang and Jason Mars},
year = {2018},
date = {2018-01-01},
booktitle = {Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 3 (Industry Papers)},
volume = {3},
pages = {33--40},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Animesh Jain, Michael Laurenzano, Gilles Pokam, Jason Mars, Lingjia Tang: Architectural Support for Convolutional Neural Networks on Modern CPUs (Inproceedings). Proceedings of the International Conference on Parallel Architectures and Compilation, ACM, New York, NY, USA, 2018.
@inproceedings{Jian:2018,
title = {Architectural Support for Convolutional Neural Networks on Modern CPUs},
author = {Animesh Jain and Michael Laurenzano and Gilles Pokam and Jason Mars and Lingjia Tang},
year = {2018},
date = {2018-01-01},
booktitle = {Proceedings of the International Conference on Parallel Architectures and Compilation},
publisher = {ACM},
address = {New York, NY, USA},
series = {PACT '18},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Shih-Chieh Lin, Chang-Hong Hsu, Walter Talamonti, Yunqi Zhang, Steve Oney, Lingjia Tang, Jason Mars: Adasa: A Conversational In-Vehicle Digital Assistant for Advanced Driver Assistance Features (Inproceedings). Proceedings of the Annual Symposium on User Interface Software and Technology, ACM, New York, NY, USA, 2018.
@inproceedings{Lin:2018b,
title = {Adasa: A Conversational In-Vehicle Digital Assistant for Advanced Driver Assistance Features},
author = {Shih-Chieh Lin and Chang-Hong Hsu and Walter Talamonti and Yunqi Zhang and Steve Oney and Lingjia Tang and Jason Mars},
year = {2018},
date = {2018-01-01},
booktitle = {Proceedings of the Annual Symposium on User Interface Software and Technology},
publisher = {ACM},
address = {New York, NY, USA},
series = {UIST '18},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Jason Mars, Michael Laurenzano, Lingjia Tang: Runtime compiler environment with dynamic co-located code execution (Miscellaneous). 2018 (US Patent 9,921,859).
@misc{mars2018runtime,
title = {Runtime compiler environment with dynamic co-located code execution},
author = {Jason Mars and Michael Laurenzano and Lingjia Tang},
year = {2018},
date = {2018-00-01},
publisher = {Google Patents},
note = {US Patent 9,921,859},
keywords = {},
pubstate = {published},
tppubtype = {misc}
}
2017
Parker Hill, Animesh Jain, Mason Hill, Babak Zamirai, Chang-Hong Hsu, Michael A Laurenzano, Scott Mahlke, Lingjia Tang, Jason Mars: DeftNN: Addressing Bottlenecks for DNN Execution on GPUs via Synapse Vector Elimination and Near-compute Data Fission (Inproceedings). Proceedings of the 50th Annual IEEE/ACM International Symposium on Microarchitecture, pp. 786–799, ACM, Cambridge, Massachusetts, 2017, ISBN: 978-1-4503-4952-9.
@inproceedings{Hill:2017:DAB:3123939.3123970,
title = {DeftNN: Addressing Bottlenecks for DNN Execution on GPUs via Synapse Vector Elimination and Near-compute Data Fission},
author = {Parker Hill and Animesh Jain and Mason Hill and Babak Zamirai and Chang-Hong Hsu and Michael A Laurenzano and Scott Mahlke and Lingjia Tang and Jason Mars},
url = {http://doi.acm.org/10.1145/3123939.3123970},
doi = {10.1145/3123939.3123970},
isbn = {978-1-4503-4952-9},
year = {2017},
date = {2017-01-01},
booktitle = {Proceedings of the 50th Annual IEEE/ACM International Symposium on Microarchitecture},
pages = {786--799},
publisher = {ACM},
address = {Cambridge, Massachusetts},
series = {MICRO-50 '17},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Hailong Yang, Quan Chen, Moeiz Riaz, Zhongzhi Luan, Lingjia Tang, Jason Mars: PowerChief: Intelligent Power Allocation for Multi-Stage Applications to Improve Responsiveness on Power Constrained CMP (Inproceedings). Proceedings of the 44th Annual International Symposium on Computer Architecture, pp. 133–146, ACM, Toronto, ON, Canada, 2017, ISBN: 978-1-4503-4892-8.
@inproceedings{Yang:2017:PIP:3079856.3080224,
title = {PowerChief: Intelligent Power Allocation for Multi-Stage Applications to Improve Responsiveness on Power Constrained CMP},
author = {Hailong Yang and Quan Chen and Moeiz Riaz and Zhongzhi Luan and Lingjia Tang and Jason Mars},
url = {http://doi.acm.org/10.1145/3079856.3080224},
doi = {10.1145/3079856.3080224},
isbn = {978-1-4503-4892-8},
year = {2017},
date = {2017-01-01},
booktitle = {Proceedings of the 44th Annual International Symposium on Computer Architecture},
pages = {133--146},
publisher = {ACM},
address = {Toronto, ON, Canada},
series = {ISCA '17},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Yiping Kang, Johann Hauswald, Cao Gao, Austin Rovinski, Trevor Mudge, Jason Mars, Lingjia Tang: Neurosurgeon: Collaborative Intelligence Between the Cloud and Mobile Edge (Inproceedings). Proceedings of the Twenty-Second International Conference on Architectural Support for Programming Languages and Operating Systems, pp. 615–629, ACM, Xi'an, China, 2017, ISBN: 978-1-4503-4465-4.
@inproceedings{Kang:2017:NCI:3037697.3037698,
title = {Neurosurgeon: Collaborative Intelligence Between the Cloud and Mobile Edge},
author = {Yiping Kang and Johann Hauswald and Cao Gao and Austin Rovinski and Trevor Mudge and Jason Mars and Lingjia Tang},
url = {http://doi.acm.org/10.1145/3037697.3037698},
doi = {10.1145/3037697.3037698},
isbn = {978-1-4503-4465-4},
year = {2017},
date = {2017-01-01},
booktitle = {Proceedings of the Twenty-Second International Conference on Architectural Support for Programming Languages and Operating Systems},
pages = {615--629},
publisher = {ACM},
address = {Xi'an, China},
series = {ASPLOS '17},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
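Neurosurgeon (entry above) partitions DNN inference between a mobile device and the cloud at layer granularity. As a rough illustration only, not the paper's predictor, the sketch below picks the split point that minimizes estimated end-to-end latency as on-device compute up to the split, one upload of the intermediate data, and cloud compute for the rest; all per-layer latencies, sizes, and the uplink rate are made-up example inputs.

```python
# Illustrative layer-split search in the spirit of the entry above; the
# numbers and the simple latency model are assumptions, not measured data.

def best_split(mobile_ms, cloud_ms, out_bytes, input_bytes, uplink_bytes_per_ms):
    """Choose s so layers [0, s) run on the device and [s, N) in the cloud;
    the layer s-1 output (or the raw input when s == 0) is uploaded once."""
    n = len(mobile_ms)
    best_latency, best_s = float("inf"), 0
    for s in range(n + 1):
        edge = sum(mobile_ms[:s])                 # time spent on the device
        cloud = sum(cloud_ms[s:])                 # time spent in the cloud
        payload = input_bytes if s == 0 else out_bytes[s - 1]
        transfer = payload / uplink_bytes_per_ms  # one upload at the split
        total = edge + transfer + cloud
        if total < best_latency:
            best_latency, best_s = total, s
    return best_s, best_latency

# Example with hypothetical numbers: a 4-layer network and a 10 MB/s uplink.
split, latency = best_split(
    mobile_ms=[40, 80, 60, 5], cloud_ms=[4, 8, 6, 1],
    out_bytes=[2_000_000, 200_000, 20_000, 4_000],
    input_bytes=5_000_000, uplink_bytes_per_ms=10_000)
print(f"run layers 0..{split - 1} on the device, the rest in the cloud "
      f"({latency:.1f} ms end-to-end)")
```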
Quan Chen, Hailong Yang, Minyi Guo, Ram Srivatsa Kannan, Jason Mars, Lingjia Tang: Prophet: Precise QoS Prediction on Non-Preemptive Accelerators to Improve Utilization in Warehouse-Scale Computers (Inproceedings). Proceedings of the Twenty-Second International Conference on Architectural Support for Programming Languages and Operating Systems, pp. 17–32, ACM, Xi'an, China, 2017, ISBN: 978-1-4503-4465-4.
@inproceedings{Chen:2017:PPQ:3037697.3037700,
title = {Prophet: Precise QoS Prediction on Non-Preemptive Accelerators to Improve Utilization in Warehouse-Scale Computers},
author = {Quan Chen and Hailong Yang and Minyi Guo and Ram Srivatsa Kannan and Jason Mars and Lingjia Tang},
url = {http://doi.acm.org/10.1145/3037697.3037700},
doi = {10.1145/3037697.3037700},
isbn = {978-1-4503-4465-4},
year = {2017},
date = {2017-01-01},
booktitle = {Proceedings of the Twenty-Second International Conference on Architectural Support for Programming Languages and Operating Systems},
pages = {17--32},
publisher = {ACM},
address = {Xi'an, China},
series = {ASPLOS '17},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Chang-Hong Hsu, Yunqi Zhang, Michael A Laurenzano, David Meisner, Thomas Wenisch, Ronald G Dreslinski, Jason Mars, Lingjia Tang: Reining in Long Tails in Warehouse-Scale Computers with Quick Voltage Boosting Using Adrenaline (Journal Article). ACM Trans. Comput. Syst., 35 (1), pp. 2:1–2:33, 2017, ISSN: 0734-2071.
@article{Hsu:2017:RLT:3067095.3054742,
title = {Reining in Long Tails in Warehouse-Scale Computers with Quick Voltage Boosting Using Adrenaline},
author = {Chang-Hong Hsu and Yunqi Zhang and Michael A Laurenzano and David Meisner and Thomas Wenisch and Ronald G Dreslinski and Jason Mars and Lingjia Tang},
url = {http://doi.acm.org/10.1145/3054742},
doi = {10.1145/3054742},
issn = {0734-2071},
year = {2017},
date = {2017-01-01},
journal = {ACM Trans. Comput. Syst.},
volume = {35},
number = {1},
pages = {2:1--2:33},
publisher = {ACM},
address = {New York, NY, USA},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
M Skach, M Arora, C H Hsu, Q Li, D Tullsen, L Tang, J Mars: Thermal Time Shifting: Decreasing Data Center Cooling Costs with Phase-Change Materials (Journal Article). IEEE Internet Computing, 21 (4), pp. 34-43, 2017, ISSN: 1089-7801.
@article{7994565,
title = {Thermal Time Shifting: Decreasing Data Center Cooling Costs with Phase-Change Materials},
author = {M Skach and M Arora and C H Hsu and Q Li and D Tullsen and L Tang and J Mars},
doi = {10.1109/MIC.2017.2911418},
issn = {1089-7801},
year = {2017},
date = {2017-01-01},
journal = {IEEE Internet Computing},
volume = {21},
number = {4},
pages = {34-43},
abstract = {As data centers increase in size and computational capacity, their growth comes at a cost: an increasing thermal load that must be removed to prevent overheating. Here, the authors propose using phase-change materials (PCMs) to shape a data center's thermal load, absorbing and releasing heat when it's advantageous. They evaluate three important opportunities for cost savings. They find that in a data center, PCM can reduce the necessary cooling system size by up to 12 percent without impacting peak throughput, or increase the number of servers by up to 14.6 percent without increasing the cooling load. In a thermally constrained setting, PCM can increase peak throughput up to 69 percent while delaying the onset of thermal limits by over 3 hours, and a wax-aware scheduler enables up to an 11 percent reduction in peak cooling load when batch jobs are added, increasing average daily throughput by 36-52 percent.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Robert Hundt, Lingjia Tang, Jason Mars: Allocation of tasks in large scale computing systems (Miscellaneous). 2017 (US Patent 9,563,532).
@misc{hundt2017allocation,
title = {Allocation of tasks in large scale computing systems},
author = {Robert Hundt and Lingjia Tang and Jason Mars},
year = {2017},
date = {2017-01-01},
note = {US Patent 9,563,532},
keywords = {},
pubstate = {published},
tppubtype = {misc}
}
Jason Mars, Michael Laurenzano, Lingjia Tang: Runtime Compiler Environment With Dynamic Co-Located Code Execution (Miscellaneous). 2017 (US Patent App. 15/428,917).
@misc{mars2017runtime,
title = {Runtime Compiler Environment With Dynamic Co-Located Code Execution},
author = {Jason Mars and Michael Laurenzano and Lingjia Tang},
year = {2017},
date = {2017-00-01},
note = {US Patent App. 15/428,917},
keywords = {},
pubstate = {published},
tppubtype = {misc}
}
2016
S Zekany, D Rings, N Harada, M A Laurenzano, L Tang, J Mars: CrystalBall: Statically analyzing runtime behavior via deep sequence learning (Inproceedings). 2016 49th Annual IEEE/ACM International Symposium on Microarchitecture (MICRO), pp. 1-12, 2016.
@inproceedings{7783727,
title = {CrystalBall: Statically analyzing runtime behavior via deep sequence learning},
author = {S Zekany and D Rings and N Harada and M A Laurenzano and L Tang and J Mars},
doi = {10.1109/MICRO.2016.7783727},
year = {2016},
date = {2016-10-01},
booktitle = {2016 49th Annual IEEE/ACM International Symposium on Microarchitecture (MICRO)},
pages = {1-12},
abstract = {Understanding dynamic program behavior is critical in many stages of the software development lifecycle, for purposes as diverse as optimization, debugging, testing, and security. This paper focuses on the problem of predicting dynamic program behavior statically. We introduce a novel technique to statically identify hot paths that leverages emerging deep learning techniques to take advantage of their ability to learn subtle, complex relationships between sequences of inputs. This approach maps well to the problem of identifying the behavior of sequences of basic blocks in program execution. Our technique is also designed to operate on the compiler's intermediate representation (IR), as opposed to the approaches taken by prior techniques that have focused primarily on source code, giving our approach language-independence. We describe the pitfalls of conventional metrics used for hot path prediction such as accuracy, and motivate the use of Area Under the Receiver Operating Characteristic curve (AUROC). Through a thorough evaluation of our technique on complex applications that include the SPEC CPU2006 benchmarks, we show that our approach achieves an AUROC of 0.85.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
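The CrystalBall abstract above argues that plain accuracy is a misleading metric for hot-path prediction and motivates AUROC instead. The short sketch below is a standard rank-based AUROC computation, not code from the paper, and the labels and scores are invented to show why a constant predictor looks good under accuracy but not under AUROC.

```python
# Why AUROC instead of accuracy for imbalanced hot/cold path labels.
# Plain rank-based AUROC; example data is made up for illustration.

def auroc(scores, labels):
    """Probability that a randomly chosen hot path (label 1) is scored
    higher than a randomly chosen cold path (label 0); ties count as 0.5."""
    pos = [s for s, l in zip(scores, labels) if l == 1]
    neg = [s for s, l in zip(scores, labels) if l == 0]
    wins = sum((p > n) + 0.5 * (p == n) for p in pos for n in neg)
    return wins / (len(pos) * len(neg))

labels   = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]   # 90% of paths are cold
constant = [0.0] * 10                        # always "cold": 90% accurate, useless
ranked   = [0.9, 0.1, 0.2, 0.3, 0.1, 0.4, 0.2, 0.1, 0.3, 0.2]
print(auroc(constant, labels))  # 0.5 -- no discriminative power
print(auroc(ranked, labels))    # 1.0 -- the hot path outranks every cold path
```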
A Jain, P Hill, S C Lin, M Khan, M E Haque, M A Laurenzano, S Mahlke, L Tang, J Mars: Concise loads and stores: The case for an asymmetric compute-memory architecture for approximation (Inproceedings). 2016 49th Annual IEEE/ACM International Symposium on Microarchitecture (MICRO), pp. 1-13, 2016.
@inproceedings{7783744,
title = {Concise loads and stores: The case for an asymmetric compute-memory architecture for approximation},
author = {A Jain and P Hill and S C Lin and M Khan and M E Haque and M A Laurenzano and S Mahlke and L Tang and J Mars},
doi = {10.1109/MICRO.2016.7783744},
year = {2016},
date = {2016-10-01},
booktitle = {2016 49th Annual IEEE/ACM International Symposium on Microarchitecture (MICRO)},
pages = {1-13},
abstract = {Cache capacity and memory bandwidth play critical roles in application performance, particularly for data-intensive applications from domains that include machine learning, numerical analysis, and data mining. Many of these applications are also tolerant to imprecise inputs and have loose constraints on the quality of output, making them ideal candidates for approximate computing. This paper introduces a novel approximate computing technique that decouples the format of data in the memory hierarchy from the format of data in the compute subsystem to significantly reduce the cost of storing and moving bits throughout the memory hierarchy and improve application performance. This asymmetric compute-memory extension to conventional architectures, ACME, adds two new instruction classes to the ISA - load-concise and store-concise - along with three small functional units to the micro-architecture to support these instructions. ACME does not affect exact execution of applications and comes into play only when concise memory operations are used. Through detailed experimentation we find that ACME is very effective at trading result accuracy for improved application performance. Our results show that ACME achieves a 1.3x speedup (up to 1.8x) while maintaining 99% accuracy, or a 1.1x speedup while maintaining 99.999% accuracy. Moreover, our approach incurs negligible area and power overheads, adding just 0.005% area and 0.1% power to a conventional modern architecture.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
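The ACME abstract above describes storing data in a cheaper format in the memory hierarchy than the one used for computation. The sketch below is only a software analogy of that accuracy-for-bits trade (the paper proposes ISA and micro-architecture support, load-concise/store-concise, which this does not model); it truncates float32 mantissas in NumPy, and the value range and bit count are assumptions.

```python
# Software analogy for "concise" storage: keep fewer mantissa bits per value.
import numpy as np

def store_concise(x: np.ndarray, keep_bits: int) -> np.ndarray:
    """Zero out all but the top `keep_bits` of the 23-bit float32 mantissa."""
    raw = np.asarray(x, dtype=np.float32).view(np.uint32)
    mask = np.uint32((0xFFFFFFFF << (23 - keep_bits)) & 0xFFFFFFFF)
    return (raw & mask).view(np.float32)

values = np.linspace(0.1, 0.9, 5, dtype=np.float32)
approx = store_concise(values, keep_bits=8)
print(values)
print(approx)
print(np.max(np.abs(values - approx)))  # worst-case absolute error on these values
```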
A Jain, M A Laurenzano, L Tang, J Mars: Continuous shape shifting: Enabling loop co-optimization via near-free dynamic code rewriting (Inproceedings). 2016 49th Annual IEEE/ACM International Symposium on Microarchitecture (MICRO), pp. 1-12, 2016.
@inproceedings{7783726,
title = {Continuous shape shifting: Enabling loop co-optimization via near-free dynamic code rewriting},
author = {A Jain and M A Laurenzano and L Tang and J Mars},
doi = {10.1109/MICRO.2016.7783726},
year = {2016},
date = {2016-10-01},
booktitle = {2016 49th Annual IEEE/ACM International Symposium on Microarchitecture (MICRO)},
pages = {1-12},
abstract = {The class of optimizations characterized by manipulating a loop's iteration space for improved cache locality and reuse (i.e., cache tiling/blocking/strip mine and interchange) are static optimizations requiring a priori information about the microarchitectural and runtime environment of an application binary. However, particularly in datacenter environments, deployed applications face numerous dynamic environments over their lifetimes. As a result, this class of optimizations can result in sub-optimal performance due to the inability to flexibly adapt iteration spaces as cache conditions change at runtime. This paper introduces continuous shape shifting, a compilation approach that removes the risks of cache tiling optimizations by dynamically rewriting (and reshaping) deployed, running application code. To realize continuous shape shifting, we present ShapeShifter, a framework for continuous monitoring of co-running applications and their runtime environments to reshape loop iteration spaces and pinpoint near-optimal loop tile configurations. Upon identifying a need for reshaping, a new tiling approach is quickly constructed for the application, new code is dynamically generated and is then seamlessly stitched into the running application with near-zero overhead. Our evaluation on a wide spectrum of runtime scenarios demonstrates that ShapeShifter achieves an average of 10-40% performance improvement (up to 2.4x) on real systems depending on the runtime environment compared to an oracle static loop tiling baseline.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
J Hauswald, M A Laurenzano, Y Zhang, C Li, A Rovinski, A Khurana, R G Dreslinski, T Mudge, V Petrucci, L Tang, J Mars: Sirius Implications for Future Warehouse-Scale Computers (Journal Article). IEEE Micro, 36 (3), pp. 42-53, 2016, ISSN: 0272-1732.
@article{7478443,
title = {Sirius Implications for Future Warehouse-Scale Computers},
author = {J Hauswald and M A Laurenzano and Y Zhang and C Li and A Rovinski and A Khurana and R G Dreslinski and T Mudge and V Petrucci and L Tang and J Mars},
doi = {10.1109/MM.2016.37},
issn = {0272-1732},
year = {2016},
date = {2016-05-01},
journal = {IEEE Micro},
volume = {36},
number = {3},
pages = {42-53},
abstract = {Demand is expected to grow significantly for cloud services that deliver sophisticated artificial intelligence on the critical path of user queries, as is the case with intelligent personal assistants such as Apple's Siri. If the prediction of the trend is correct, these types of applications will likely consume most of the world's computing cycles. The Sirius project was motivated to investigate what this future might look like and how cloud architectures should evolve to achieve it.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Johann Hauswald, Michael A Laurenzano, Yunqi Zhang, Hailong Yang, Yiping Kang, Cheng Li, Austin Rovinski, Arjun Khurana, Ronald G Dreslinski, Trevor Mudge, Vinicius Petrucci, Lingjia Tang, Jason Mars: Designing Future Warehouse-Scale Computers for Sirius, an End-to-End Voice and Vision Personal Assistant (Journal Article). ACM Trans. Comput. Syst., 34 (1), pp. 2:1–2:32, 2016, ISSN: 0734-2071.
@article{Hauswald:2016:DFW:2912578.2870631,
title = {Designing Future Warehouse-Scale Computers for Sirius, an End-to-End Voice and Vision Personal Assistant},
author = {Johann Hauswald and Michael A Laurenzano and Yunqi Zhang and Hailong Yang and Yiping Kang and Cheng Li and Austin Rovinski and Arjun Khurana and Ronald G Dreslinski and Trevor Mudge and Vinicius Petrucci and Lingjia Tang and Jason Mars},
url = {http://doi.acm.org/10.1145/2870631},
doi = {10.1145/2870631},
issn = {0734-2071},
year = {2016},
date = {2016-04-01},
journal = {ACM Trans. Comput. Syst.},
volume = {34},
number = {1},
pages = {2:1--2:32},
publisher = {ACM},
address = {New York, NY, USA},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Michael A Laurenzano, Yunqi Zhang, Jiang Chen, Lingjia Tang, Jason Mars: PowerChop: Identifying and Managing Non-critical Units in Hybrid Processor Architectures (Inproceedings). Proceedings of the 43rd International Symposium on Computer Architecture, pp. 140–152, IEEE Press, Seoul, Republic of Korea, 2016, ISBN: 978-1-4673-8947-1.
@inproceedings{Laurenzano:2016:PIM:3001136.3001152,
title = {PowerChop: Identifying and Managing Non-critical Units in Hybrid Processor Architectures},
author = {Michael A Laurenzano and Yunqi Zhang and Jiang Chen and Lingjia Tang and Jason Mars},
url = {https://doi.org/10.1109/ISCA.2016.22},
doi = {10.1109/ISCA.2016.22},
isbn = {978-1-4673-8947-1},
year = {2016},
date = {2016-01-01},
booktitle = {Proceedings of the 43rd International Symposium on Computer Architecture},
pages = {140--152},
publisher = {IEEE Press},
address = {Seoul, Republic of Korea},
series = {ISCA '16},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Yunqi Zhang, David Meisner, Jason Mars, Lingjia Tang: Treadmill: Attributing the Source of Tail Latency Through Precise Load Testing and Statistical Inference (Inproceedings). Proceedings of the 43rd International Symposium on Computer Architecture, pp. 456–468, IEEE Press, Seoul, Republic of Korea, 2016, ISBN: 978-1-4673-8947-1.
@inproceedings{Zhang:2016:TAS:3001136.3001186,
title = {Treadmill: Attributing the Source of Tail Latency Through Precise Load Testing and Statistical Inference},
author = {Yunqi Zhang and David Meisner and Jason Mars and Lingjia Tang},
url = {https://doi.org/10.1109/ISCA.2016.47},
doi = {10.1109/ISCA.2016.47},
isbn = {978-1-4673-8947-1},
year = {2016},
date = {2016-01-01},
booktitle = {Proceedings of the 43rd International Symposium on Computer Architecture},
pages = {456--468},
publisher = {IEEE Press},
address = {Seoul, Republic of Korea},
series = {ISCA '16},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Michael A Laurenzano, Parker Hill, Mehrzad Samadi, Scott Mahlke, Jason Mars, Lingjia Tang: Input Responsiveness: Using Canary Inputs to Dynamically Steer Approximation (Inproceedings). Proceedings of the 37th ACM SIGPLAN Conference on Programming Language Design and Implementation, pp. 161–176, ACM, Santa Barbara, CA, USA, 2016, ISBN: 978-1-4503-4261-2.
@inproceedings{Laurenzano:2016:IRU:2908080.2908087,
title = {Input Responsiveness: Using Canary Inputs to Dynamically Steer Approximation},
author = {Michael A Laurenzano and Parker Hill and Mehrzad Samadi and Scott Mahlke and Jason Mars and Lingjia Tang},
url = {http://doi.acm.org/10.1145/2908080.2908087},
doi = {10.1145/2908080.2908087},
isbn = {978-1-4503-4261-2},
year = {2016},
date = {2016-01-01},
booktitle = {Proceedings of the 37th ACM SIGPLAN Conference on Programming Language Design and Implementation},
pages = {161--176},
publisher = {ACM},
address = {Santa Barbara, CA, USA},
series = {PLDI '16},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Quan Chen, Hailong Yang, Jason Mars, Lingjia Tang: Baymax: QoS Awareness and Increased Utilization for Non-Preemptive Accelerators in Warehouse Scale Computers (Inproceedings). Proceedings of the Twenty-First International Conference on Architectural Support for Programming Languages and Operating Systems, pp. 681–696, ACM, Atlanta, Georgia, USA, 2016, ISBN: 978-1-4503-4091-5.
@inproceedings{Chen:2016:BQA:2872362.2872368,
title = {Baymax: QoS Awareness and Increased Utilization for Non-Preemptive Accelerators in Warehouse Scale Computers},
author = {Quan Chen and Hailong Yang and Jason Mars and Lingjia Tang},
url = {http://doi.acm.org/10.1145/2872362.2872368},
doi = {10.1145/2872362.2872368},
isbn = {978-1-4503-4091-5},
year = {2016},
date = {2016-01-01},
booktitle = {Proceedings of the Twenty-First International Conference on Architectural Support for Programming Languages and Operating Systems},
pages = {681--696},
publisher = {ACM},
address = {Atlanta, Georgia, USA},
series = {ASPLOS '16},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Lingjia Tang, Jason Mars, Robert Hundt: System and methods for sharing memory subsystem resources among datacenter applications (Miscellaneous). 2016 (US Patent 9,401,869).
@misc{tang2016system,
title = {System and methods for sharing memory subsystem resources among datacenter applications},
author = {Lingjia Tang and Jason Mars and Robert Hundt},
year = {2016},
date = {2016-00-01},
note = {US Patent 9,401,869},
keywords = {},
pubstate = {published},
tppubtype = {misc}
}
Jason Mars, Robert Hundt, Neil A Vachharajani: Cache contention management on a multicore processor based on the degree of contention exceeding a threshold (Miscellaneous). 2016 (US Patent 9,268,542).
@misc{mars2016cache,
title = {Cache contention management on a multicore processor based on the degree of contention exceeding a threshold},
author = {Jason Mars and Robert Hundt and Neil A Vachharajani},
year = {2016},
date = {2016-00-01},
note = {US Patent 9,268,542},
keywords = {},
pubstate = {published},
tppubtype = {misc}
}
2015
M Khan, M A Laurenzano, J Mars, E Hagersten, D Black-Schaffer: AREP: Adaptive Resource Efficient Prefetching for Maximizing Multicore Performance (Inproceedings). 2015 International Conference on Parallel Architecture and Compilation (PACT), pp. 367-378, 2015, ISSN: 1089-795X.
@inproceedings{7429320,
title = {AREP: Adaptive Resource Efficient Prefetching for Maximizing Multicore Performance},
author = {M Khan and M A Laurenzano and J Mars and E Hagersten and D Black-Schaffer},
doi = {10.1109/PACT.2015.35},
issn = {1089-795X},
year = {2015},
date = {2015-10-01},
booktitle = {2015 International Conference on Parallel Architecture and Compilation (PACT)},
pages = {367-378},
abstract = {Modern processors widely use hardware prefetching to hide memory latency. While aggressive hardware prefetchers can improve performance significantly for some applications, they can limit the overall performance in highly-utilized multicore processors by saturating the offchip bandwidth and wasting last-level cache capacity. Co-executing applications can slowdown due to contention over these shared resources. This work introduces Adaptive Resource Efficient Prefetching (AREP) -- a runtime framework that dynamically combines software prefetching and hardware prefetching to maximize throughput in highly utilized multicore processors. AREP achieves better performance by prefetching data in a resource efficient way -- conserving offchip-bandwidth and last-level cache capacity with accurate prefetching and by applying cache-bypassing when possible. AREP dynamically explores a mix of hardware/software prefetching policies, then selects and applies the best performing policy. AREP is phase-aware and re-explores (at runtime) for the best prefetching policy at phase boundaries. A multitude of experiments with workload mixes and parallel applications on a modern high performance multicore show that AREP can increase throughput by up to 49% (8.1% on average). This is complemented by improved fairness, resulting in average quality of service above 94%.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
V Petrucci, M A Laurenzano, J Doherty, Y Zhang, D Mossé, J Mars, L Tang: Octopus-Man: QoS-driven task management for heterogeneous multicores in warehouse-scale computers (Inproceedings). 2015 IEEE 21st International Symposium on High Performance Computer Architecture (HPCA), pp. 246-258, 2015, ISSN: 1530-0897.
@inproceedings{7056037,
title = {Octopus-Man: QoS-driven task management for heterogeneous multicores in warehouse-scale computers},
author = {V Petrucci and M A Laurenzano and J Doherty and Y Zhang and D Mossé and J Mars and L Tang},
doi = {10.1109/HPCA.2015.7056037},
issn = {1530-0897},
year = {2015},
date = {2015-02-01},
booktitle = {2015 IEEE 21st International Symposium on High Performance Computer Architecture (HPCA)},
pages = {246-258},
abstract = {Heterogeneous multicore architectures have the potential to improve energy efficiency by integrating power-efficient wimpy cores with high-performing brawny cores. However, it is an open question as how to deliver energy reduction while ensuring the quality of service (QoS) of latency-sensitive web-services running on such heterogeneous multicores in warehouse-scale computers (WSCs). In this work, we first investigate the implications of heterogeneous multicores in WSCs and show that directly adopting heterogeneous multicores without re-designing the software stack to provide QoS management leads to significant QoS violations. We then present Octopus-Man, a novel QoS-aware task management solution that dynamically maps latency-sensitive tasks to the least power-hungry processing resources that are sufficient to meet the QoS requirements. Using carefully-designed feedback-control mechanisms, Octopus-Man addresses critical challenges that emerge due to uncertainties in workload fluctuations and adaptation dynamics in a real system. Our evaluation using web-search and memcached running on a real-system Intel heterogeneous prototype demonstrates that Octopus-Man improves energy efficiency by up to 41% (CPU power) and up to 15% (system power) over an all-brawny WSC design while adhering to specified QoS targets.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
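Octopus-Man (entry above) is described as a feedback-controlled mapper that keeps latency-sensitive tasks on the least power-hungry cores that still meet QoS. The toy controller below only illustrates that idea; the core levels, margins, hysteresis scheme, and the p99 trace are assumptions, not the paper's controller.

```python
# Toy feedback controller in the spirit of the entry above: step up to a
# stronger core configuration on QoS violation, step down only with slack.
CORE_LEVELS = ["wimpy", "brawny-lo", "brawny-hi"]   # ordered by power draw

def next_level(level: int, p99_ms: float, target_ms: float,
               up_margin: float = 1.0, down_margin: float = 0.7) -> int:
    """Hysteresis (down_margin < up_margin) avoids oscillating between levels."""
    if p99_ms > target_ms * up_margin and level < len(CORE_LEVELS) - 1:
        return level + 1
    if p99_ms < target_ms * down_margin and level > 0:
        return level - 1
    return level

# Example: a load spike pushes p99 past a 100 ms target, then subsides.
level = 0
for p99 in [60, 95, 140, 120, 80, 55, 40]:
    level = next_level(level, p99, target_ms=100)
    print(f"p99={p99:>3} ms -> run on {CORE_LEVELS[level]}")
```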
C H Hsu, Y Zhang, M A Laurenzano, D Meisner, T Wenisch, J Mars, L Tang, R G Dreslinski: Adrenaline: Pinpointing and reining in tail queries with quick voltage boosting (Inproceedings). 2015 IEEE 21st International Symposium on High Performance Computer Architecture (HPCA), pp. 271-282, 2015, ISSN: 1530-0897.
@inproceedings{7056039,
title = {Adrenaline: Pinpointing and reining in tail queries with quick voltage boosting},
author = {C H Hsu and Y Zhang and M A Laurenzano and D Meisner and T Wenisch and J Mars and L Tang and R G Dreslinski},
doi = {10.1109/HPCA.2015.7056039},
issn = {1530-0897},
year = {2015},
date = {2015-02-01},
booktitle = {2015 IEEE 21st International Symposium on High Performance Computer Architecture (HPCA)},
pages = {271-282},
abstract = {Reducing the long tail of the query latency distribution in modern warehouse scale computers is critical for improving performance and quality of service of workloads such as Web Search and Memcached. Traditional turbo boost increases a processor's voltage and frequency during a coarse-grain sliding window, boosting all queries that are processed during that window. However, the inability of such a technique to pinpoint tail queries for boosting limits its tail reduction benefit. In this work, we propose Adrenaline, an approach to leverage finer granularity, 10's of nanoseconds, voltage boosting to effectively rein in the tail latency with query-level precision. Two key insights underlie this work. First, emerging finer granularity voltage/frequency boosting is an enabling mechanism for intelligent allocation of the power budget to precisely boost only the queries that contribute to the tail latency; and second, per-query characteristics can be used to design indicators for proactively pinpointing these queries, triggering boosting accordingly. Based on these insights, Adrenaline effectively pinpoints and boosts queries that are likely to increase the tail distribution and can reap more benefit from the voltage/frequency boost. By evaluating under various workload configurations, we demonstrate the effectiveness of our methodology. We achieve up to a 2.50x tail latency improvement for Memcached and up to a 3.03x for Web Search over coarse-grained DVFS given a fixed boosting power budget. When optimizing for energy reduction, Adrenaline achieves up to a 1.81x improvement for Memcached and up to a 1.99x for Web Search over coarse-grained DVFS.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Johann Hauswald, Yiping Kang, Michael A Laurenzano, Quan Chen, Cheng Li, Trevor Mudge, Ronald G Dreslinski, Jason Mars, Lingjia Tang: DjiNN and Tonic: DNN As a Service and Its Implications for Future Warehouse Scale Computers (Inproceedings). Proceedings of the 42nd Annual International Symposium on Computer Architecture, pp. 27–40, ACM, Portland, Oregon, 2015, ISBN: 978-1-4503-3402-0.
@inproceedings{Hauswald:2015:DTD:2749469.2749472,
title = {DjiNN and Tonic: DNN As a Service and Its Implications for Future Warehouse Scale Computers},
author = {Johann Hauswald and Yiping Kang and Michael A Laurenzano and Quan Chen and Cheng Li and Trevor Mudge and Ronald G Dreslinski and Jason Mars and Lingjia Tang},
url = {http://doi.acm.org/10.1145/2749469.2749472},
doi = {10.1145/2749469.2749472},
isbn = {978-1-4503-3402-0},
year = {2015},
date = {2015-01-01},
booktitle = {Proceedings of the 42nd Annual International Symposium on Computer Architecture},
pages = {27--40},
publisher = {ACM},
address = {Portland, Oregon},
series = {ISCA '15},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Matt Skach, Manish Arora, Chang-Hong Hsu, Qi Li, Dean Tullsen, Lingjia Tang, Jason Mars: Thermal Time Shifting: Leveraging Phase Change Materials to Reduce Cooling Costs in Warehouse-scale Computers (Inproceedings). Proceedings of the 42nd Annual International Symposium on Computer Architecture, pp. 439–449, ACM, Portland, Oregon, 2015, ISBN: 978-1-4503-3402-0.
@inproceedings{Skach:2015:TTS:2749469.2749474,
title = {Thermal Time Shifting: Leveraging Phase Change Materials to Reduce Cooling Costs in Warehouse-scale Computers},
author = {Matt Skach and Manish Arora and Chang-Hong Hsu and Qi Li and Dean Tullsen and Lingjia Tang and Jason Mars},
url = {http://doi.acm.org/10.1145/2749469.2749474},
doi = {10.1145/2749469.2749474},
isbn = {978-1-4503-3402-0},
year = {2015},
date = {2015-01-01},
booktitle = {Proceedings of the 42nd Annual International Symposium on Computer Architecture},
pages = {439--449},
publisher = {ACM},
address = {Portland, Oregon},
series = {ISCA '15},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Johann Hauswald, Michael A Laurenzano, Yunqi Zhang, Cheng Li, Austin Rovinski, Arjun Khurana, Ronald G Dreslinski, Trevor Mudge, Vinicius Petrucci, Lingjia Tang, Jason Mars: Sirius: An Open End-to-End Voice and Vision Personal Assistant and Its Implications for Future Warehouse Scale Computers (Inproceedings). Proceedings of the Twentieth International Conference on Architectural Support for Programming Languages and Operating Systems, pp. 223–238, ACM, Istanbul, Turkey, 2015, ISBN: 978-1-4503-2835-7.
@inproceedings{Hauswald:2015:SOE:2694344.2694347,
title = {Sirius: An Open End-to-End Voice and Vision Personal Assistant and Its Implications for Future Warehouse Scale Computers},
author = {Johann Hauswald and Michael A Laurenzano and Yunqi Zhang and Cheng Li and Austin Rovinski and Arjun Khurana and Ronald G Dreslinski and Trevor Mudge and Vinicius Petrucci and Lingjia Tang and Jason Mars},
url = {http://doi.acm.org/10.1145/2694344.2694347},
doi = {10.1145/2694344.2694347},
isbn = {978-1-4503-2835-7},
year = {2015},
date = {2015-01-01},
booktitle = {Proceedings of the Twentieth International Conference on Architectural Support for Programming Languages and Operating Systems},
pages = {223--238},
publisher = {ACM},
address = {Istanbul, Turkey},
series = {ASPLOS '15},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2014
Alex D Breslow, Ananta Tiwari, Martin Schulz, Laura Carrington, Lingjia Tang, Jason Mars: Enabling Fair Pricing on High Performance Computer Systems with Node Sharing (Journal Article). Sci. Program., 22 (2), pp. 59–74, 2014, ISSN: 1058-9244.
@article{Breslow:2014:EFP:3184296.3184298,
title = {Enabling Fair Pricing on High Performance Computer Systems with Node Sharing},
author = {Alex D Breslow and Ananta Tiwari and Martin Schulz and Laura Carrington and Lingjia Tang and Jason Mars},
url = {http://dl.acm.org/citation.cfm?id=3184296.3184298},
issn = {1058-9244},
year = {2014},
date = {2014-04-01},
journal = {Sci. Program.},
volume = {22},
number = {2},
pages = {59--74},
publisher = {IOS Press},
address = {Amsterdam, The Netherlands},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Michael A Laurenzano, Yunqi Zhang, Lingjia Tang, Jason Mars: Protean Code: Achieving Near-Free Online Code Transformations for Warehouse Scale Computers (Inproceedings). Proceedings of the 47th Annual IEEE/ACM International Symposium on Microarchitecture, pp. 558–570, IEEE Computer Society, Cambridge, United Kingdom, 2014, ISBN: 978-1-4799-6998-2.
@inproceedings{Laurenzano:2014:PCA:2742155.2742212,
title = {Protean Code: Achieving Near-Free Online Code Transformations for Warehouse Scale Computers},
author = {Michael A Laurenzano and Yunqi Zhang and Lingjia Tang and Jason Mars},
url = {http://dx.doi.org/10.1109/MICRO.2014.21},
doi = {10.1109/MICRO.2014.21},
isbn = {978-1-4799-6998-2},
year = {2014},
date = {2014-01-01},
booktitle = {Proceedings of the 47th Annual IEEE/ACM International Symposium on Microarchitecture},
pages = {558--570},
publisher = {IEEE Computer Society},
address = {Cambridge, United Kingdom},
series = {MICRO-47},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Yunqi Zhang, Michael A Laurenzano, Jason Mars, Lingjia Tang: SMiTe: Precise QoS Prediction on Real-System SMT Processors to Improve Utilization in Warehouse Scale Computers (Inproceedings). Proceedings of the 47th Annual IEEE/ACM International Symposium on Microarchitecture, pp. 406–418, IEEE Computer Society, Cambridge, United Kingdom, 2014, ISBN: 978-1-4799-6998-2.
@inproceedings{Zhang:2014:SPQ:2742155.2742197,
title = {SMiTe: Precise QoS Prediction on Real-System SMT Processors to Improve Utilization in Warehouse Scale Computers},
author = {Yunqi Zhang and Michael A Laurenzano and Jason Mars and Lingjia Tang},
url = {http://dx.doi.org/10.1109/MICRO.2014.53},
doi = {10.1109/MICRO.2014.53},
isbn = {978-1-4799-6998-2},
year = {2014},
date = {2014-01-01},
booktitle = {Proceedings of the 47th Annual IEEE/ACM International Symposium on Microarchitecture},
pages = {406--418},
publisher = {IEEE Computer Society},
address = {Cambridge, United Kingdom},
series = {MICRO-47},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Yan Zhai, Xiao Zhang, Stephane Eranian, Lingjia Tang, Jason Mars: HaPPy: Hyperthread-aware Power Profiling Dynamically (Inproceedings). Proceedings of the 2014 USENIX Conference on USENIX Annual Technical Conference, pp. 211–218, USENIX Association, Philadelphia, PA, 2014, ISBN: 978-1-931971-10-2.
@inproceedings{Zhai:2014:HHP:2643634.2643657,
title = {HaPPy: Hyperthread-aware Power Profiling Dynamically},
author = {Yan Zhai and Xiao Zhang and Stephane Eranian and Lingjia Tang and Jason Mars},
url = {http://dl.acm.org/citation.cfm?id=2643634.2643657},
isbn = {978-1-931971-10-2},
year = {2014},
date = {2014-01-01},
booktitle = {Proceedings of the 2014 USENIX Conference on USENIX Annual Technical Conference},
pages = {211--218},
publisher = {USENIX Association},
address = {Philadelphia, PA},
series = {USENIX ATC'14},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
|
2013
|
A D Breslow, A Tiwari, M Schulz, L Carrington, L Tang, J Mars Enabling fair pricing on HPC systems with node sharing Inproceedings 2013 SC - International Conference for High Performance Computing, Networking, Storage and Analysis (SC), pp. 1-12, 2013, ISSN: 2167-4329. Abstract | Links | BibTeX @inproceedings{6877470,
title = {Enabling fair pricing on HPC systems with node sharing},
author = {A D Breslow and A Tiwari and M Schulz and L Carrington and L Tang and J Mars},
doi = {10.1145/2503210.2503256},
issn = {2167-4329},
year = {2013},
date = {2013-11-01},
booktitle = {2013 SC - International Conference for High Performance Computing, Networking, Storage and Analysis (SC)},
pages = {1-12},
abstract = {Co-location, where multiple jobs share compute nodes in large-scale HPC systems, has been shown to increase aggregate throughput and energy efficiency by 10 to 20%. However, system operators disallow co-location due to fair-pricing concerns, i.e., the lack of a pricing mechanism that considers performance interference from co-running jobs. In the current pricing model, application execution time determines the price, which results in unfair prices paid by the minority of users whose jobs suffer from co-location. This paper presents POPPA, a runtime system that enables fair pricing by delivering precise online interference detection and facilitates the adoption of co-location on supercomputers. POPPA leverages a novel shutter mechanism - a cyclic, fine-grained interference sampling mechanism to accurately deduce the interference between co-runners - to provide unbiased pricing of jobs that share nodes. POPPA is able to quantify inter-application interference within 4% mean absolute error on a variety of co-located benchmark and real scientific workloads.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Co-location, where multiple jobs share compute nodes in large-scale HPC systems, has been shown to increase aggregate throughput and energy efficiency by 10 to 20%. However, system operators disallow co-location due to fair-pricing concerns, i.e., the lack of a pricing mechanism that considers performance interference from co-running jobs. In the current pricing model, application execution time determines the price, which results in unfair prices paid by the minority of users whose jobs suffer from co-location. This paper presents POPPA, a runtime system that enables fair pricing by delivering precise online interference detection and facilitates the adoption of co-location on supercomputers. POPPA leverages a novel shutter mechanism - a cyclic, fine-grained interference sampling mechanism to accurately deduce the interference between co-runners - to provide unbiased pricing of jobs that share nodes. POPPA is able to quantify inter-application interference within 4% mean absolute error on a variety of co-located benchmark and real scientific workloads. |
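The POPPA abstract above sketches the core mechanism: a cyclic "shutter" that briefly quiets co-runners to sample a job's near-solo progress rate, compares it against the job's co-located progress rate, and uses the ratio to discount the price. The Python sketch below only illustrates that idea under stated assumptions; it is not POPPA's code, and every name in it (read_progress, pause_corunners, resume_corunners, fair_price, the window lengths) is a hypothetical placeholder for hooks a real runtime would provide.

# Illustrative sketch (not POPPA's implementation): alternate between a short
# "shuttered" window, where co-running jobs are paused to observe a job's
# near-solo progress rate, and a longer open window of normal co-located
# execution. The ratio of the two rates estimates the slowdown caused by
# interference, which can then be used to discount the job's price.

import time
from typing import Callable

def estimate_interference(
    read_progress: Callable[[], float],   # e.g., instructions retired by the priced job (hypothetical hook)
    pause_corunners: Callable[[], None],  # throttle/pause co-located jobs (hypothetical hook)
    resume_corunners: Callable[[], None],
    shutter_ms: float = 50.0,             # short solo-sampling window
    open_ms: float = 950.0,               # normal co-located window
    cycles: int = 10,
) -> float:
    """Return the estimated slowdown factor (>= 1.0) due to co-location."""
    solo_rate = 0.0
    shared_rate = 0.0
    for _ in range(cycles):
        # Shuttered window: co-runners paused, observe near-solo progress rate.
        pause_corunners()
        start = read_progress()
        time.sleep(shutter_ms / 1000.0)
        solo_rate += (read_progress() - start) / shutter_ms
        resume_corunners()

        # Open window: normal co-located execution.
        start = read_progress()
        time.sleep(open_ms / 1000.0)
        shared_rate += (read_progress() - start) / open_ms
    solo_rate /= cycles
    shared_rate /= cycles
    return solo_rate / shared_rate if shared_rate > 0 else 1.0

def fair_price(base_price_per_hour: float, hours: float, slowdown: float) -> float:
    """Charge for the time the job would have needed without interference."""
    return base_price_per_hour * hours / slowdown

In this framing, a job slowed 1.3x by its neighbors is billed as if it had finished 1.3x sooner, which is one way to realize the unbiased pricing the abstract describes.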
Alex D Breslow, Ananta Tiwari, Martin Schulz, Laura Carrington, Lingjia Tang, Jason Mars Enabling Fair Pricing on HPC Systems with Node Sharing Inproceedings Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis, pp. 37:1–37:12, ACM, Denver, Colorado, 2013, ISBN: 978-1-4503-2378-9. Links | BibTeX @inproceedings{Breslow:2013:EFP:2503210.2503256,
title = {Enabling Fair Pricing on HPC Systems with Node Sharing},
author = {Alex D Breslow and Ananta Tiwari and Martin Schulz and Laura Carrington and Lingjia Tang and Jason Mars},
url = {http://doi.acm.org/10.1145/2503210.2503256},
doi = {10.1145/2503210.2503256},
isbn = {978-1-4503-2378-9},
year = {2013},
date = {2013-01-01},
booktitle = {Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis},
pages = {37:1--37:12},
publisher = {ACM},
address = {Denver, Colorado},
series = {SC '13},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Hailong Yang, Qi Zhao, Zhongzhi Luan, Depei Qian, Ming Xie, Jason Mars, Lingjia Tang iMeter: An Integrated VM Power Model Based on Performance Profiling Inproceedings Proceedings of the 2013 42nd International Conference on Parallel Processing, pp. 359–368, IEEE Computer Society, Washington, DC, USA, 2013, ISBN: 978-0-7695-5117-3. Links | BibTeX @inproceedings{Yang:2013:IIV:2570457.2571069,
title = {iMeter: An Integrated VM Power Model Based on Performance Profiling},
author = {Hailong Yang and Qi Zhao and Zhongzhi Luan and Depei Qian and Ming Xie and Jason Mars and Lingjia Tang},
url = {http://dx.doi.org/10.1109/ICPP.2013.45},
doi = {10.1109/ICPP.2013.45},
isbn = {978-0-7695-5117-3},
year = {2013},
date = {2013-01-01},
booktitle = {Proceedings of the 2013 42nd International Conference on Parallel Processing},
pages = {359--368},
publisher = {IEEE Computer Society},
address = {Washington, DC, USA},
series = {ICPP '13},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Jason Mars, Lingjia Tang Whare-map: Heterogeneity in "Homogeneous" Warehouse-scale Computers Inproceedings Proceedings of the 40th Annual International Symposium on Computer Architecture, pp. 619–630, ACM, Tel-Aviv, Israel, 2013, ISBN: 978-1-4503-2079-5. Links | BibTeX @inproceedings{Mars:2013:WHH:2485922.2485975,
title = {Whare-map: Heterogeneity in "Homogeneous" Warehouse-scale Computers},
author = {Jason Mars and Lingjia Tang},
url = {http://doi.acm.org/10.1145/2485922.2485975},
doi = {10.1145/2485922.2485975},
isbn = {978-1-4503-2079-5},
year = {2013},
date = {2013-01-01},
booktitle = {Proceedings of the 40th Annual International Symposium on Computer Architecture},
pages = {619--630},
publisher = {ACM},
address = {Tel-Aviv, Israel},
series = {ISCA '13},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Hailong Yang, Alex Breslow, Jason Mars, Lingjia Tang Bubble-flux: Precise Online QoS Management for Increased Utilization in Warehouse Scale Computers Inproceedings Proceedings of the 40th Annual International Symposium on Computer Architecture, pp. 607–618, ACM, Tel-Aviv, Israel, 2013, ISBN: 978-1-4503-2079-5. Links | BibTeX @inproceedings{Yang:2013:BPO:2485922.2485974,
title = {Bubble-flux: Precise Online QoS Management for Increased Utilization in Warehouse Scale Computers},
author = {Hailong Yang and Alex Breslow and Jason Mars and Lingjia Tang},
url = {http://doi.acm.org/10.1145/2485922.2485974},
doi = {10.1145/2485922.2485974},
isbn = {978-1-4503-2079-5},
year = {2013},
date = {2013-01-01},
booktitle = {Proceedings of the 40th Annual International Symposium on Computer Architecture},
pages = {607--618},
publisher = {ACM},
address = {Tel-Aviv, Israel},
series = {ISCA '13},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Lingjia Tang, Jason Mars, Wei Wang, Tanima Dey, Mary Lou Soffa ReQoS: Reactive Static/Dynamic Compilation for QoS in Warehouse Scale Computers Inproceedings Proceedings of the Eighteenth International Conference on Architectural Support for Programming Languages and Operating Systems, pp. 89–100, ACM, Houston, Texas, USA, 2013, ISBN: 978-1-4503-1870-9. Links | BibTeX @inproceedings{Tang:2013:RRS:2451116.2451126,
title = {ReQoS: Reactive Static/Dynamic Compilation for QoS in Warehouse Scale Computers},
author = {Lingjia Tang and Jason Mars and Wei Wang and Tanima Dey and Mary Lou Soffa},
url = {http://doi.acm.org/10.1145/2451116.2451126},
doi = {10.1145/2451116.2451126},
isbn = {978-1-4503-1870-9},
year = {2013},
date = {2013-01-01},
booktitle = {Proceedings of the Eighteenth International Conference on Architectural Support for Programming Languages and Operating Systems},
pages = {89--100},
publisher = {ACM},
address = {Houston, Texas, USA},
series = {ASPLOS '13},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Lingjia Tang, Jason Mars, Xiao Zhang, Robert Hagmann, Robert Hundt, Eric Tune Optimizing Google's Warehouse Scale Computers: The NUMA Experience Inproceedings Proceedings of the 2013 IEEE 19th International Symposium on High Performance Computer Architecture (HPCA), pp. 188–197, IEEE Computer Society, Washington, DC, USA, 2013, ISBN: 978-1-4673-5585-8. Links | BibTeX @inproceedings{Tang:2013:OGW:2495252.2495508,
title = {Optimizing Google's Warehouse Scale Computers: The NUMA Experience},
author = {Lingjia Tang and Jason Mars and Xiao Zhang and Robert Hagmann and Robert Hundt and Eric Tune},
url = {http://dx.doi.org/10.1109/HPCA.2013.6522318},
doi = {10.1109/HPCA.2013.6522318},
isbn = {978-1-4673-5585-8},
year = {2013},
date = {2013-01-01},
booktitle = {Proceedings of the 2013 IEEE 19th International Symposium on High Performance Computer Architecture (HPCA)},
pages = {188--197},
publisher = {IEEE Computer Society},
address = {Washington, DC, USA},
series = {HPCA '13},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Jason Mars, Robert Hundt Scenario based optimization Miscellaneous 2013, (US Patent 8,578,355). BibTeX @misc{mars2013scenario,
title = {Scenario based optimization},
author = {Jason Mars and Robert Hundt},
year = {2013},
date = {2013-01-01},
note = {US Patent 8,578,355},
keywords = {},
pubstate = {published},
tppubtype = {misc}
}
|
2012
|
Kristen Walcott-Justice, Jason Mars, Mary Lou Soffa THeME: A System for Testing by Hardware Monitoring Events Inproceedings Proceedings of the 2012 International Symposium on Software Testing and Analysis, pp. 12–22, ACM, Minneapolis, MN, USA, 2012, ISBN: 978-1-4503-1454-1. Links | BibTeX @inproceedings{Walcott-Justice:2012:TST:2338965.2336755,
title = {THeME: A System for Testing by Hardware Monitoring Events},
author = {Kristen Walcott-Justice and Jason Mars and Mary Lou Soffa},
url = {http://doi.acm.org/10.1145/2338965.2336755},
doi = {10.1145/2338965.2336755},
isbn = {978-1-4503-1454-1},
year = {2012},
date = {2012-01-01},
booktitle = {Proceedings of the 2012 International Symposium on Software Testing and Analysis},
pages = {12--22},
publisher = {ACM},
address = {Minneapolis, MN, USA},
series = {ISSTA 2012},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Jason Mars, Naveen Kumar BlockChop: Dynamic Squash Elimination for Hybrid Processor Architecture Inproceedings Proceedings of the 39th Annual International Symposium on Computer Architecture, pp. 536–547, IEEE Computer Society, Portland, Oregon, 2012, ISBN: 978-1-4503-1642-2. Links | BibTeX @inproceedings{Mars:2012:BDS:2337159.2337221,
title = {BlockChop: Dynamic Squash Elimination for Hybrid Processor Architecture},
author = {Jason Mars and Naveen Kumar},
url = {http://dl.acm.org/citation.cfm?id=2337159.2337221},
isbn = {978-1-4503-1642-2},
year = {2012},
date = {2012-01-01},
booktitle = {Proceedings of the 39th Annual International Symposium on Computer Architecture},
pages = {536--547},
publisher = {IEEE Computer Society},
address = {Portland, Oregon},
series = {ISCA '12},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Jason Mars, Lingjia Tang, Kevin Skadron, Mary Lou Soffa, Robert Hundt Increasing Utilization in Modern Warehouse-Scale Computers Using Bubble-Up Journal Article IEEE Micro, 32 (3), pp. 88–99, 2012, ISSN: 0272-1732. Links | BibTeX @article{Mars:2012:IUM:2311639.2311825,
title = {Increasing Utilization in Modern Warehouse-Scale Computers Using Bubble-Up},
author = {Jason Mars and Lingjia Tang and Kevin Skadron and Mary Lou Soffa and Robert Hundt},
url = {http://dx.doi.org/10.1109/MM.2012.22},
doi = {10.1109/MM.2012.22},
issn = {0272-1732},
year = {2012},
date = {2012-01-01},
journal = {IEEE Micro},
volume = {32},
number = {3},
pages = {88--99},
publisher = {IEEE Computer Society Press},
address = {Los Alamitos, CA, USA},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
|
Wei Wang, Tanima Dey, Jason Mars, Lingjia Tang, Jack W Davidson, Mary Lou Soffa Performance Analysis of Thread Mappings with a Holistic View of the Hardware Resources Inproceedings Proceedings of the 2012 IEEE International Symposium on Performance Analysis of Systems & Software, pp. 156–167, IEEE Computer Society, Washington, DC, USA, 2012, ISBN: 978-1-4673-1143-4. Links | BibTeX @inproceedings{Wang:2012:PAT:2310660.2311004,
title = {Performance Analysis of Thread Mappings with a Holistic View of the Hardware Resources},
author = {Wei Wang and Tanima Dey and Jason Mars and Lingjia Tang and Jack W Davidson and Mary Lou Soffa},
url = {http://dx.doi.org/10.1109/ISPASS.2012.6189222},
doi = {10.1109/ISPASS.2012.6189222},
isbn = {978-1-4673-1143-4},
year = {2012},
date = {2012-01-01},
booktitle = {Proceedings of the 2012 IEEE International Symposium on Performance Analysis of Systems & Software},
pages = {156--167},
publisher = {IEEE Computer Society},
address = {Washington, DC, USA},
series = {ISPASS '12},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
|