library.bib


@article{Ackermann2009a,
  abstract = {BACKGROUND: Analysis of microarray and other high-throughput data on the basis of gene sets, rather than individual genes, is becoming more important in genomic studies. Correspondingly, a large number of statistical approaches for detecting gene set enrichment have been proposed, but both the interrelations and the relative performance of the various methods are still very much unclear.

RESULTS: We conduct an extensive survey of statistical approaches for gene set analysis and identify a common modular structure underlying most published methods. Based on this finding we propose a general framework for detecting gene set enrichment. This framework provides a meta-theory of gene set analysis that not only helps to gain a better understanding of the relative merits of each embedded approach but also facilitates a principled comparison and offers insights into the relative interplay of the methods.

CONCLUSION: We use this framework to conduct a computer simulation comparing 261 different variants of gene set enrichment procedures and to analyze two experimental data sets. Based on the results we offer recommendations for best practices regarding the choice of effective procedures for gene set enrichment analysis.},
  author = {Ackermann, Marit and Strimmer, Korbinian},
  date = {2009-01},
  doi = {10/dvzd5z},
  eprint = {19192285},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Ackermann and Strimmer - 2009 - A general modular framework for gene set enrichmen.pdf},
  issn = {1471-2105},
  journaltitle = {BMC Bioinformatics},
  keywords = {Algorithms,Animals,Computer Simulation,Databases; Genetic,Humans,Models; Statistical,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods},
  pages = {47},
  title = {A General Modular Framework for Gene Set Enrichment Analysis},
  volume = {10}
}

@article{Adcock1997,
  author = {Adcock, C. J.},
  date = {1997-07},
  doi = {10/csrcsd},
  file = {/Users/ryan/Documents/Zotero Library/Adcock - 1997 - Sample size determination a review.pdf},
  issn = {0039-0526},
  journaltitle = {Journal of the Royal Statistical Society: Series D (The Statistician)},
  keywords = {average coverage criterion,average length criterion,bayes factors,bayesian methods,binomial,coherence,distribution,hypothesis testing,maximum expected utility,mcnemar,multinomial distribution,multivariate analysis,normal distribution,pivots,regression,s test,sample size determination,tolerance intervals,worst},
  number = {2},
  pages = {261--283},
  title = {Sample Size Determination: A Review},
  volume = {46}
}

@article{Aggarwal2005,
  abstract = {Mesenchymal stem cells (MSCs) are multipotent cells found in several adult tissues. Transplanted allogeneic MSCs can be detected in recipients at extended time points, indicating a lack of immune recognition and clearance. As well, a role for bone marrow-derived MSCs in reducing the incidence and severity of graft-versus-host disease (GVHD) during allogeneic transplantation has recently been reported; however, the mechanisms remain to be investigated. We examined the immunomodulatory functions of human MSCs (hMSCs) by coculturing them with purified subpopulations of immune cells and report here that hMSCs altered the cytokine secretion profile of dendritic cells (DCs), naive and effector T cells (T helper 1 [TH1] and TH2), and natural killer (NK) cells to induce a more anti-inflammatory or tolerant phenotype. Specifically, the hMSCs caused mature DCs type 1 (DC1) to decrease tumor necrosis factor {$\alpha$} (TNF-{$\alpha$}) secretion and mature DC2 to increase interleukin-10 (IL-10) secretion; hMSCs caused TH1 cells to decrease interferon {$\gamma$} (IFN-{$\gamma$}) and caused the TH2 cells to increase secretion of IL-4; hMSCs caused an increase in the proportion of regulatory T cells (T Regs) present; and hMSCs decreased secretion of IFN-{$\gamma$} from the NK cells. Mechanistically, the hMSCs produced elevated prostaglandin E2 (PGE2) in co-cultures, and inhibitors of PGE2 production mitigated hMSC-mediated immune modulation. These data offer insight into the interactions between allogeneic MSCs and immune cells and provide mechanisms likely involved with the in vivo MSC-mediated induction of tolerance that could be therapeutic for reduction of GVHD, rejection, and modulation of inflammation. \textcopyright{} 2005 by The American Society of Hematology.},
  author = {Aggarwal, Sudeepta and Pittenger, Mark F.},
  date = {2005-02-15},
  doi = {10/fnb37s},
  eprint = {15494428},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Aggarwal and Pittenger - 2005 - Human mesenchymal stem cells modulate allogeneic i.pdf},
  issn = {0006-4971},
  journaltitle = {Blood},
  number = {4},
  pages = {1815--1822},
  title = {Human Mesenchymal Stem Cells Modulate Allogeneic Immune Cell Responses},
  volume = {105}
}

@article{Aitken2018,
  abstract = {Background: CTCF binding to DNA helps partition the mammalian genome into discrete structural and regulatory domains. Complete removal of CTCF from mammalian cells causes catastrophic genome dysregulation, likely due to widespread collapse of 3D chromatin looping and alterations to inter- and intra-TAD interactions within the nucleus. In contrast, Ctcf hemizygous mice with lifelong reduction of CTCF expression are viable, albeit with increased cancer incidence. Here, we exploit chronic Ctcf hemizygosity to reveal its homeostatic roles in maintaining genome function and integrity. Results: We find that Ctcf hemizygous cells show modest but robust changes in almost a thousand sites of genomic CTCF occupancy; these are enriched for lower affinity binding events with weaker evolutionary conservation across the mouse lineage. Furthermore, we observe dysregulation of the expression of several hundred genes, which are concentrated in cancer-related pathways, and are caused by changes in transcriptional regulation. Chromatin structure is preserved but some loop interactions are destabilized; these are often found around differentially expressed genes and their enhancers. Importantly, the transcriptional alterations identified in vitro are recapitulated in mouse tumors and also in human cancers. Conclusions: This multi-dimensional genomic and epigenomic profiling of a Ctcf hemizygous mouse model system shows that chronic depletion of CTCF dysregulates steady-state gene expression by subtly altering transcriptional regulation, changes which can also be observed in primary tumors.},
  author = {Aitken, Sarah J. and Ibarra-Soria, Ximena and Kentepozidou, Elissavet and Flicek, Paul and Feig, Christine and Marioni, John C. and Odom, Duncan T.},
  date = {2018},
  doi = {10/gd3fhd},
  file = {/Users/ryan/Documents/Zotero Library/Aitken et al. - 2018 - CTCF maintains regulatory homeostasis of cancer pa.pdf},
  issn = {1474-760X},
  journaltitle = {Genome Biology},
  keywords = {Cancer,Chromatin architecture,Chromatin state,CTCF,Hemizygosity,Transcription},
  number = {1},
  pages = {1--17},
  title = {{{CTCF}} Maintains Regulatory Homeostasis of Cancer Pathways},
  volume = {19}
}

@article{Alexa2006a,
  abstract = {MOTIVATION: The result of a typical microarray experiment is a long list of genes with corresponding expression measurements. This list is only the starting point for a meaningful biological interpretation. Modern methods identify relevant biological processes or functions from gene expression data by scoring the statistical significance of predefined functional gene groups, e.g. based on Gene Ontology (GO). We develop methods that increase the explanatory power of this approach by integrating knowledge about relationships between the GO terms into the calculation of the statistical significance.

RESULTS: We present two novel algorithms that improve GO group scoring using the underlying GO graph topology. The algorithms are evaluated on real and simulated gene expression data. We show that both methods eliminate local dependencies between GO terms and point to relevant areas in the GO graph that remain undetected with state-of-the-art algorithms for scoring functional terms. A simulation study demonstrates that the new methods exhibit a higher level of detecting relevant biological terms than competing methods.},
  author = {Alexa, Adrian and Rahnenf{\"u}hrer, J{\"o}rg and Lengauer, Thomas},
  date = {2006-07-01},
  doi = {10/bzj9v5},
  eprint = {16606683},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Alexa et al. - 2006 - Improved scoring of functional groups from gene ex.pdf},
  issn = {1367-4803},
  journaltitle = {Bioinformatics},
  keywords = {Algorithms,Cluster Analysis,Computational Biology,Computational Biology: methods,Databases; Genetic,Gene Expression Profiling,Gene Expression Regulation,Gene Expression Regulation; Neoplastic,Humans,Leukemia,Leukemia: metabolism,Models; Statistical,Oligonucleotide Array Sequence Analysis,Protein Folding},
  number = {13},
  pages = {1600--1607},
  title = {Improved Scoring of Functional Groups from Gene Expression Data by Decorrelating {{GO}} Graph Structure},
  volume = {22}
}

@article{Alexeyenko2012a,
  abstract = {BACKGROUND: Gene-set enrichment analyses (GEA or GSEA) are commonly used for biological characterization of an experimental gene-set. This is done by finding known functional categories, such as pathways or Gene Ontology terms, that are over-represented in the experimental set; the assessment is based on an overlap statistic. Rich biological information in terms of gene interaction network is now widely available, but this topological information is not used by GEA, so there is a need for methods that exploit this type of information in high-throughput data analysis.

RESULTS: We developed a method of network enrichment analysis (NEA) that extends the overlap statistic in GEA to network links between genes in the experimental set and those in the functional categories. For the crucial step in statistical inference, we developed a fast network randomization algorithm in order to obtain the distribution of any network statistic under the null hypothesis of no association between an experimental gene-set and a functional category. We illustrate the NEA method using gene and protein expression data from a lung cancer study.

CONCLUSIONS: The results indicate that the NEA method is more powerful than the traditional GEA, primarily because the relationships between gene sets were more strongly captured by network connectivity rather than by simple overlaps.},
  author = {Alexeyenko, Andrey and Lee, Woojoo and Pernemalm, Maria and Guegan, Justin and Dessen, Philippe and Lazar, Vladimir and Lehti{\"o}, Janne and Pawitan, Yudi},
  date = {2012-01},
  doi = {10/f22bh8},
  eprint = {22966941},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Alexeyenko et al. - 2012 - Network enrichment analysis extension of gene-set.pdf},
  issn = {1471-2105},
  journaltitle = {BMC Bioinformatics},
  keywords = {Algorithms,Gene Expression,Gene Expression Profiling,Gene Expression Profiling: methods,Gene Regulatory Networks,Humans,Lung Neoplasms,Lung Neoplasms: genetics,Lung Neoplasms: metabolism,Protein Biosynthesis,Proteomics,Proteomics: methods},
  pages = {226},
  title = {Network Enrichment Analysis: Extension of Gene-Set Enrichment Analysis to Gene Networks},
  volume = {13}
}

@article{Allison2006,
  abstract = {In just a few years, microarrays have gone from obscurity to being almost ubiquitous in biological research. At the same time, the statistical methodology for microarray analysis has progressed from simple visual assessments of results to a weekly deluge of papers that describe purportedly novel algorithms for analysing changes in gene expression. Although the many procedures that are available might be bewildering to biologists who wish to apply them, statistical geneticists are recognizing commonalities among the different methods. Many are special cases of more general models, and points of consensus are emerging about the general approaches that warrant use and elaboration.},
  author = {Allison, David B and Cui, Xiangqin and Page, Grier P and Sabripour, Mahyar},
  date = {2006-01},
  doi = {10/frn2j4},
  eprint = {16369572},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Allison et al. - 2006 - Microarray data analysis from disarray to consoli.pdf},
  issn = {1471-0056},
  journaltitle = {Nature Reviews Genetics},
  keywords = {Algorithms,Cluster Analysis,Computational Biology,Computational Biology: methods,Computer Simulation,Data Interpretation; Statistical,DNA; Complementary,DNA; Complementary: metabolism,Gene Expression Profiling,Gene Expression Profiling: methods,Gene Expression Regulation,Genetic Techniques,Genetics,Humans,Microarray Analysis,Models; Biological,Models; Statistical,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods,RNA; Messenger,RNA; Messenger: metabolism},
  number = {1},
  pages = {55--65},
  title = {Microarray Data Analysis: From Disarray to Consolidation and Consensus},
  volume = {7}
}

@article{Amemiya2019,
  abstract = {Functional genomics assays based on high-throughput sequencing greatly expand our ability to understand the genome. Here, we define the ENCODE blacklist\textemdash{}a comprehensive set of regions in the human, mouse, worm, and fly genomes that have anomalous, unstructured, or high signal in next-generation sequencing experiments independent of cell line or experiment. The removal of the ENCODE blacklist is an essential quality measure when analyzing functional genomics data.},
  author = {Amemiya, Haley M. and Kundaje, Anshul and Boyle, Alan P.},
  date = {2019},
  doi = {10/gf4jsb},
  file = {/Users/ryan/Documents/Zotero Library/Amemiya et al. - 2019 - The ENCODE Blacklist Identification of Problemati.pdf},
  issn = {2045-2322},
  journaltitle = {Scientific Reports},
  number = {1},
  pages = {1--5},
  title = {The {{ENCODE Blacklist}}: {{Identification}} of {{Problematic Regions}} of the {{Genome}}},
  volume = {9}
}

% DESeq paper: Anders and Huber, Genome Biology 11(10), article R106 (2010).
% NOTE(review): some keywords below look like MeSH headings that were split at
% their comma (e.g. "Genetic" / "Models", "RNA" / "Sequence Analysis"); other
% entries in this file keep such headings joined with "; " (e.g.
% "Models; Statistical"). Confirm and re-tag in the reference manager.
@article{Anders2010,
  abstract = {High-throughput sequencing assays such as RNA-Seq, ChIP-Seq or barcode counting provide quantitative readouts in the form of count data. To infer differential signal in such data correctly and with good statistical power, estimation of data variability throughout the dynamic range and a suitable error model are required. We propose a method based on the negative binomial distribution, with variance and mean linked by local regression and present an implementation, DESeq, as an R/Bioconductor package.},
  author = {Anders, Simon and Huber, Wolfgang},
  date = {2010-10-27},
  doi = {10/btmbk5},
  eprint = {20979621},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Anders and Huber - 2010 - Differential expression analysis for sequence coun.pdf;/Users/ryan/Documents/Zotero Library/Anders and Huber - 2010 - Differential expression analysis for sequence coun2.pdf;/Users/ryan/Documents/Zotero Library/Anders and Huber - 2010 - Differential expression analysis for sequence coun3.pdf;/Users/ryan/Zotero/storage/I65673QD/gb-2010-11-10-r106.html},
  issn = {1474-760X},
  journaltitle = {Genome Biology},
  keywords = {Animals,Binomial Distribution,Chromatin Immunoprecipitation,Chromatin Immunoprecipitation: methods,Computational Biology,Computational Biology: methods,Drosophila,Drosophila: genetics,Gene Expression Profiling,Gene Expression Profiling: methods,Genetic,High-Throughput Nucleotide Sequencing,High-Throughput Nucleotide Sequencing: methods,Linear Models,Models,RNA,RNA: methods,Saccharomyces cerevisiae,Saccharomyces cerevisiae: genetics,Sequence Analysis,Stem Cells,Tissue Culture Techniques},
  number = {10},
  pages = {R106},
  shortjournal = {Genome Biology},
  title = {Differential Expression Analysis for Sequence Count Data},
  volume = {11}
}

@article{Anders2012,
  abstract = {RNA-Seq is a powerful tool for the study of alternative splicing and other forms of alternative isoform expression. Understanding the regulation of these processes requires sensitive and specific detection of differential isoform abundance in comparisons between conditions, cell types or tissues. We present DEXSeq, a statistical method to test for differential exon usage in RNA-Seq data. DEXSeq employs generalized linear models and offers reliable control of false discoveries by taking biological variation into account. DEXSeq detects genes, and in many cases specific exons, that are subject to differential exon usage with high sensitivity. We demonstrate the versatility of DEXSeq by applying it to several data sets. The method facilitates the study of regulation and function of alternative exon usage on a genome-wide scale. An implementation of DEXSeq is available as an R/Bioconductor package.},
  author = {Anders, Simon and Reyes, Alejandro and Huber, Wolfgang},
  date = {2012},
  doi = {10/ggcxjt},
  file = {/Users/ryan/Documents/Zotero Library/Anders et al. - 2012 - Detecting differential usage of exons from RNA-seq.pdf},
  journaltitle = {Genome Research},
  pages = {1--30},
  title = {Detecting Differential Usage of Exons from {{RNA}}-Seq Data}
}

% HTSeq documentation web page, cited as an online resource (no DOI recorded).
% NOTE(review): the url has no urldate; add the access date if it is known.
@online{Anders2013,
  author = {Anders, Simon},
  date = {2013},
  keywords = {\#nosource},
  title = {{{HTSeq}}: {{Analysing}} High-Throughput Sequencing Data with {{Python}}},
  url = {http://www-huber.embl.de/users/anders/HTSeq/doc/index.html}
}

@article{Anders2013a,
  abstract = {RNA sequencing (RNA-seq) has been rapidly adopted for the profiling of transcriptomes in many areas of biology, including studies into gene regulation, development and disease. Of particular interest is the discovery of differentially expressed genes across different conditions (e.g., tissues, perturbations) while optionally adjusting for other systematic factors that affect the data-collection process. There are a number of subtle yet crucial aspects of these analyses, such as read counting, appropriate treatment of biological variability, quality control checks and appropriate setup of statistical modeling. Several variations have been presented in the literature, and there is a need for guidance on current best practices. This protocol presents a state-of-the-art computational and statistical RNA-seq differential expression analysis workflow largely based on the free open-source R language and Bioconductor software and, in particular, on two widely used tools, DESeq and edgeR. Hands-on time for typical small experiments (e.g., 4-10 samples) can be {$<$}1 h, with computation time {$<$}1 d using a standard desktop PC.},
  author = {Anders, Simon and McCarthy, Davis J and Chen, Yunshun and Okoniewski, Michal and Smyth, Gordon K and Huber, Wolfgang and Robinson, Mark D},
  date = {2013-09},
  doi = {10/f4794j},
  eprint = {23975260},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Anders et al. - 2013 - Count-based differential expression analysis of RN.pdf},
  issn = {1750-2799},
  journaltitle = {Nature Protocols},
  keywords = {Base Sequence,Computational Biology,Computational Biology: methods,Gene Expression Profiling,Gene Expression Profiling: methods,Sequence Analysis; RNA,Sequence Analysis; RNA: methods,Software,Workflow},
  number = {9},
  pages = {1765--1786},
  title = {Count-Based Differential Expression Analysis of {{RNA}} Sequencing Data Using {{R}} and {{Bioconductor}}},
  volume = {8}
}

@article{Anders2014,
  abstract = {Motivation: A large choice of tools exists for many standard tasks in the analysis of high-throughput sequencing (HTS) data. However, once a project deviates from standard work flows, custom scripts are needed. Results: We present HTSeq, a Python library to facilitate the rapid development of such scripts. HTSeq offers parsers for many common data formats in HTS projects, as well as classes to represent data such as genomic coordinates, sequences, sequencing reads, alignments, gene model information, variant calls, and provides data structures that allow for querying via genomic coordinates. We also present htseq-count, a tool developed with HTSeq that preprocesses RNA-Seq data for differential expression analysis by counting the overlap of reads with genes. Availability: HTSeq is released as open-source software under the GNU General Public Licence and available from http://www-huber.embl.de/HTSeq or from the Python Package Index, https://pypi.python.org/pypi/HTSeq},
  author = {Anders, S. and Pyl, P. T. and Huber, W.},
  date = {2015-01-15},
  doi = {10/f6v7kx},
  eprint = {25260700},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Anders et al. - 2015 - HTSeq--a Python framework to work with high-throug.pdf},
  issn = {1367-4803},
  journaltitle = {Bioinformatics},
  number = {2},
  pages = {166--169},
  title = {{{HTSeq}}\textemdash{}a {{Python}} Framework to Work with High-Throughput Sequencing Data},
  volume = {31}
}

@article{Ankrum2014,
  abstract = {The diverse immunomodulatory properties of mesenchymal stem/stromal cells (MSCs) may be exploited for treatment of a multitude of inflammatory conditions. MSCs have long been reported to be hypoimmunogenic or 'immune privileged'; this property is thought to enable MSC transplantation across major histocompatibility barriers and the creation of off-the-shelf therapies consisting of MSCs grown in culture. However, recent studies describing generation of antibodies against and immune rejection of allogeneic donor MSCs suggest that MSCs may not actually be immune privileged. Nevertheless, whether rejection of donor MSCs influences the efficacy of allogeneic MSC therapies is not known, and no definitive clinical advantage of autologous MSCs over allogeneic MSCs has been demonstrated to date. Although MSCs may exert therapeutic function through a brief 'hit and run' mechanism, protecting MSCs from immune detection and prolonging their persistence in vivo may improve clinical outcomes and prevent patient sensitization toward donor antigens. \textcopyright{} 2014 Nature America, Inc.},
  author = {Ankrum, James A. and Ong, Joon Faii and Karp, Jeffrey M.},
  date = {2014},
  doi = {10/f5vjkk},
  file = {/Users/ryan/Documents/Zotero Library/Ankrum et al. - 2014 - Mesenchymal stem cells Immune evasive, not immune.pdf},
  issn = {1546-1696},
  journaltitle = {Nature Biotechnology},
  number = {3},
  pages = {252--260},
  title = {Mesenchymal Stem Cells: {{Immune}} Evasive, Not Immune Privileged},
  volume = {32}
}

@article{Aponte2011,
  abstract = {Two intermingled hypothalamic neuron populations specified by expression of agouti-related peptide (AGRP) or pro-opiomelanocortin (POMC) positively and negatively influence feeding behavior, respectively, possibly by reciprocally regulating downstream melanocortin receptors. However, the sufficiency of these neurons to control behavior and the relationship of their activity to the magnitude and dynamics of feeding are unknown. To measure this, we used channelrhodopsin-2 for cell type-specific photostimulation. Activation of only 800 AGRP neurons in mice evoked voracious feeding within minutes. The behavioral response increased with photoexcitable neuron number, photostimulation frequency and stimulus duration. Conversely, POMC neuron stimulation reduced food intake and body weight, which required melanocortin receptor signaling. However, AGRP neuron-mediated feeding was not dependent on suppressing this melanocortin pathway, indicating that AGRP neurons directly engage feeding circuits. Furthermore, feeding was evoked selectively over drinking without training or prior photostimulus exposure, which suggests that AGRP neurons serve a dedicated role coordinating this complex behavior.},
  author = {Aponte, Yexica and Atasoy, Deniz and Sternson, Scott M},
  date = {2011-03},
  doi = {10/fwh8kz},
  eprint = {21209617},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Aponte et al. - 2011 - AGRP neurons are sufficient to orchestrate feeding.pdf},
  issn = {1546-1726},
  journaltitle = {Nature Neuroscience},
  keywords = {Agouti-Related Protein,Agouti-Related Protein: genetics,Agouti-Related Protein: metabolism,Animal,Animal: physiology,Animals,Behavior,Classical,Classical: physiology,Conditioning,Eating,Feeding Behavior,Feeding Behavior: physiology,Hypothalamus,Hypothalamus: cytology,Hypothalamus: metabolism,Melanocortins,Melanocortins: metabolism,Mice,Neurons,Neurons: metabolism,Photic Stimulation,Pro-Opiomelanocortin,Pro-Opiomelanocortin: metabolism,Recombinant Fusion Proteins,Recombinant Fusion Proteins: genetics,Recombinant Fusion Proteins: metabolism,Rhodopsin,Rhodopsin: genetics,Rhodopsin: metabolism},
  number = {3},
  pages = {351--355},
  title = {{{AGRP}} Neurons Are Sufficient to Orchestrate Feeding Behavior Rapidly and without Training},
  volume = {14}
}

% NOTE(review): incomplete record. It has no date, journaltitle or DOI, and the
% title reads like a supplementary-methods heading. The author list matches the
% Argelaguet2017b and Argelaguet2018 entries; presumably this is a companion
% document to that paper. Confirm what it is before citing it.
@article{Argelaguet,
  author = {Argelaguet, Ricard and Velten, Britta and Arnol, Damien and Dietrich, Sascha and Zenz, Thorsten and Marioni, John C and Buettner, Florian and Huber, Wolfgang and Stegle, Oliver},
  file = {/Users/ryan/Documents/Zotero Library/Argelaguet et al. - Methods for  Multi-Omics factor analysis disentan.pdf},
  pages = {1-16},
  title = {Methods for : {{Multi}}-{{Omics}} Factor Analysis Disentangles Heterogeneity in Blood Cancer {{Multi}}-{{Omics Factor Analysis}} Model}
}

@article{Argelaguet2017b,
  author = {Argelaguet, Ricard and Velten, Britta and Arnol, Damien and Dietrich, Sascha and Zenz, Thorsten and Marioni, John C and Buettner, Florian and Huber, Wolfgang and Stegle, Oliver},
  date = {2017},
  doi = {10/gfvttf},
  file = {/Users/ryan/Documents/Zotero Library/Argelaguet et al. - 2017 - Multi-Omics factor analysis disentangles heterogen.pdf},
  issue = {Cll},
  pages = {1--16},
  title = {Multi-{{Omics}} Factor Analysis Disentangles Heterogeneity in Blood Cancer}
}

@article{Argelaguet2018,
  abstract = {Multi-omics studies promise the improved characterization of biological processes across molecular layers. However, methods for the unsupervised integration of the resulting heterogeneous data sets are lacking. We present Multi-Omics Factor Analysis (MOFA), a computational method for discovering the principal sources of variation in multi-omics data sets. MOFA infers a set of (hidden) factors that capture biological and technical sources of variability. It disentangles axes of heterogeneity that are shared across multiple modalities and those specific to individual data modalities. The learnt factors enable a variety of downstream analyses, including identification of sample subgroups, data imputation and the detection of outlier samples. We applied MOFA to a cohort of 200 patient samples of chronic lymphocytic leukaemia, profiled for somatic mutations, RNA expression, DNA methylation and ex vivo drug responses. MOFA identified major dimensions of disease heterogeneity, including immunoglobulin heavy-chain variable region status, trisomy of chromosome 12 and previously underappreciated drivers, such as response to oxidative stress. In a second application, we used MOFA to analyse single-cell multi-omics data, identifying coordinated transcriptional and epigenetic changes along cell differentiation.},
  author = {Argelaguet, Ricard and Velten, Britta and Arnol, Damien and Dietrich, Sascha and Zenz, Thorsten and Marioni, John C. and Buettner, Florian and Huber, Wolfgang and Stegle, Oliver},
  date = {2018-06-20},
  doi = {10/gdqq3f},
  eprint = {29925568},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Argelaguet et al. - 2018 - Multi‐Omics Factor Analysis—a framework for unsupe.pdf},
  issn = {1744-4292},
  journaltitle = {Molecular Systems Biology},
  keywords = {biology,data integration,dimensionality reduction,genome-scale,integrative,methods,multi-omics,personalized medicine,resources,single-cell omics,subject categories computational biology},
  number = {6},
  pages = {1--13},
  title = {Multi-{{Omics Factor Analysis}}\textemdash{}a Framework for Unsupervised Integration of Multi-omics Data Sets},
  volume = {14}
}

@article{Arnaud2016,
  abstract = {Transcriptome studies based on quantitative sequencing can estimate levels of gene expression by measuring target RNA abundance in sequencing libraries. Sequencing costs are proportional to the total number of sequenced reads, and in order to cover rare RNAs, considerable quantities of abundant and identical reads are needed. This major limitation can be addressed by depleting a proportion of the most abundant sequences from the library. However, such depletion strategies involve either extra handling of the input RNA sample or use of a large number of reverse transcription primers, termed not-so-random (NSR) primers, which are costly to synthesize. Taking advantage of the high tolerance of reverse transcriptase to mis-prime, we found that it is possible to use as few as 40 pseudo-random (PS) reverse transcription primers to decrease the rate of undesirable abundant sequences within a library without affecting the overall transcriptome diversity. PS primers are simple to design and can be used to deplete several undesirable RNAs simultaneously, thus creating a flexible tool for enriching transcriptome libraries for rare transcript sequences.},
  author = {Arnaud, Oph{\'e}lie and Kato, Sachi and Poulain, St{\'e}phane and Plessy, Charles},
  date = {2016-04-01},
  doi = {10/ggcxjv},
  eprint = {27071605},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Arnaud et al. - 2016 - Targeted reduction of highly abundant transcripts .pdf},
  issn = {1940-9818},
  journaltitle = {BioTechniques},
  keywords = {high-throughput sequencing,nanoCAGE,rRNA,undesirable sequences},
  number = {4},
  pages = {169--174},
  title = {Targeted Reduction of Highly Abundant Transcripts Using Pseudo-Random Primers},
  volume = {60}
}

@article{Aryee2014,
  abstract = {Motivation: The recently released Infinium HumanMethylation450 array (the '450k' array) provides a high-throughput assay to quantify DNA methylation (DNAm) at {$\sim$}450 000 loci across a range of genomic features. Although less comprehensive than high-throughput sequencing-based techniques, this product is more cost-effective and promises to be the most widely used DNAm high-throughput measurement technology over the next several years. Results: Here we describe a suite of computational tools that incorporate state-of-the-art statistical techniques for the analysis of DNAm data. The software is structured to easily adapt to future versions of the technology. We include methods for preprocessing, quality assessment and detection of differentially methylated regions from the kilobase to the megabase scale. We show how our software provides a powerful and flexible development platform for future methods. We also illustrate how our methods empower the technology to make discoveries previously thought to be possible only with sequencing-based methods. \textcopyright{} The Author 2014.},
  author = {Aryee, Martin J. and Jaffe, Andrew E. and Corrada-Bravo, Hector and Ladd-Acosta, Christine and Feinberg, Andrew P. and Hansen, Kasper D. and Irizarry, Rafael A.},
  date = {2014-05-15},
  doi = {10/f3m42q},
  file = {/Users/ryan/Documents/Zotero Library/Aryee et al. - 2014 - Minfi A flexible and comprehensive Bioconductor p.pdf},
  issn = {1460-2059},
  journaltitle = {Bioinformatics},
  number = {10},
  pages = {1363--1369},
  title = {Minfi: {{A}} Flexible and Comprehensive {{Bioconductor}} Package for the Analysis of {{Infinium DNA}} Methylation Microarrays},
  volume = {30}
}

@article{Aschoff2013,
  abstract = {MOTIVATION: Alternative splicing is central for cellular processes and substantially increases transcriptome and proteome diversity. Aberrant splicing events often have pathological consequences and are associated with various diseases and cancer types. The emergence of next-generation RNA sequencing (RNA-seq) provides an exciting new technology to analyse alternative splicing on a large scale. However, algorithms that enable the analysis of alternative splicing from short-read sequencing are not fully established yet and there are still no standard solutions available for a variety of data analysis tasks.

RESULTS: We present a new method and software to predict genes that are differentially spliced between two different conditions using RNA-seq data. Our method uses geometric angles between the high dimensional vectors of exon read counts. With this, differential splicing can be detected even if the splicing events are composed of higher complexity and involve previously unknown splicing patterns. We applied our approach to two case studies including neuroblastoma tumour data with favourable and unfavourable clinical courses. We show the validity of our predictions as well as the applicability of our method in the context of patient clustering. We verified our predictions by several methods including simulated experiments and complementary in silico analyses. We found a significant number of exons with specific regulatory splicing factor motifs for predicted genes and a substantial number of publications linking those genes to alternative splicing. Furthermore, we could successfully exploit splicing information to cluster tissues and patients. Finally, we found additional evidence of splicing diversity for many predicted genes in normalized read coverage plots and in reads that span exon-exon junctions.

AVAILABILITY: SplicingCompass is licensed under the GNU GPL and freely available as a package in the statistical language R at http://www.ichip.de/software/SplicingCompass.html},
  author = {Aschoff, Moritz and Hotz-Wagenblatt, Agnes and Glatting, Karl-Heinz and Fischer, Matthias and Eils, Roland and K{\"o}nig, Rainer},
  date = {2013-05-01},
  doi = {10/f4w547},
  eprint = {23449093},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Aschoff et al. - 2013 - SplicingCompass differential splicing detection u.pdf},
  issn = {1367-4811},
  journaltitle = {Bioinformatics},
  number = {9},
  pages = {1141-8},
  title = {{{SplicingCompass}}: Differential Splicing Detection Using {{RNA}}-Seq Data},
  volume = {29}
}

@article{Au2013,
  abstract = {Although transcriptional and posttranscriptional events are detected in RNA-Seq data from second-generation sequencing, full-length mRNA isoforms are not captured. On the other hand, third-generation sequencing, which yields much longer reads, has current limitations of lower raw accuracy and throughput. Here, we combine second-generation sequencing and third-generation sequencing with a custom-designed method for isoform identification and quantification to generate a high-confidence isoform dataset for human embryonic stem cells (hESCs). We report 8,084 RefSeq-annotated isoforms detected as full-length and an additional 5,459 isoforms predicted through statistical inference. Over one-third of these are novel isoforms, including 273 RNAs from gene loci that have not previously been identified. Further characterization of the novel loci indicates that a subset is expressed in pluripotent cells but not in diverse fetal and adult tissues; moreover, their reduced expression perturbs the network of pluripotency-associated genes. Results suggest that gene identification, even in well-characterized human cell lines and tissues, is likely far from complete.},
  author = {Au, Kin Fai and Sebastiano, Vittorio and Afshar, Pegah Tootoonchi and Durruthy, Jens Durruthy and Lee, Lawrence and Williams, Brian A and van Bakel, Harm and Schadt, Eric E and Reijo-Pera, Renee A and Underwood, Jason G and Wong, Wing Hung},
  date = {2013-11-26},
  doi = {10/f5jn67},
  eprint = {24282307},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Au et al. - 2013 - Characterization of the human ESC transcriptome by.pdf},
  issn = {1091-6490},
  journaltitle = {Proceedings of the National Academy of Sciences of the United States of America},
  options = {useprefix=true},
  title = {Characterization of the Human {{ESC}} Transcriptome by Hybrid Sequencing}
}

@article{Auer2010,
  abstract = {Next-generation sequencing technologies are quickly becoming the preferred approach for characterizing and quantifying entire genomes. Even though data produced from these technologies are proving to be the most informative of any thus far, very little attention has been paid to fundamental design aspects of data collection and analysis, namely sampling, randomization, replication, and blocking. We discuss these concepts in an RNA sequencing framework. Using simulations we demonstrate the benefits of collecting replicated RNA sequencing data according to well known statistical designs that partition the sources of biological and technical variation. Examples of these designs and their corresponding models are presented with the goal of testing differential expression.},
  author = {Auer, Paul L and Doerge, R W},
  date = {2010-06},
  doi = {10/dqrqxw},
  eprint = {20439781},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Auer and Doerge - 2010 - Statistical design and analysis of RNA sequencing .pdf},
  issn = {1943-2631},
  journaltitle = {Genetics},
  keywords = {Base Sequence,Clinical Laboratory Techniques,Research,Research: methods},
  number = {2},
  pages = {405-16},
  title = {Statistical Design and Analysis of {{RNA}} Sequencing Data},
  volume = {185}
}

@article{aulettaPerspectivesEmergingRoles,
  abstract = {Multipotent, bone marrow\textendash{}derived stromal cells (BMSCs, also known as mesenchymal stem cells [MSCs]), are culture-expanded, nonhematopoietic cells with immunomodulatory effects currently being investigated as novel cellular therapy to prevent and to treat clinical disease associated with aberrant immune response. Emerging preclinical studies suggest that BMSCs may protect against infectious challenge either by direct effects on the pathogen or through indirect effects on the host. BMSCs may reduce pathogen burden by inhibiting growth through soluble factors or by enhancing immune cell antimicrobial function. In the host, BMSCs may attenuate pro-inflammatory cytokine and chemokine induction, reduce pro-inflammatory cell migration into sites of injury and infection, and induce immunoregulatory soluble and cellular factors to preserve organ function. These preclinical studies provide provocative hints into the direction MSC therapeutics may take in the future. Notably, BMSCs appear to function as a critical fulcrum, providing balance by promoting pathogen clearance during the initial inflammatory response while suppressing inflammation to preserve host integrity and facilitate tissue repair. Such exquisite balance in BMSC function appears intrinsically linked to Toll-like receptor signaling and immune crosstalk.},
  author = {Auletta, Jeffery J. and Deans, Robert J. and Bartholomew, Amelia M.},
  date = {2012-02-23},
  doi = {10/fzmb9t},
  file = {/Users/ryan/Documents/Zotero Library/Auletta et al. - Perspectives Emerging roles for multipotent, bone.pdf},
  ids = {aulettaEmergingRolesMultipotent2012},
  journaltitle = {Blood},
  keywords = {cyno-project},
  number = {8},
  pages = {1801-1809},
  title = {Emerging Roles for Multipotent, Bone Marrow-Derived Stromal Cells in Host Defense},
  volume = {119}
}

@article{Bailey2013,
  abstract = {Mapping the chromosomal locations of transcription factors, nucleosomes, histone modifications, chromatin remodeling enzymes, chaperones, and polymerases is one of the key tasks of modern biology, as evidenced by the Encyclopedia of DNA Elements (ENCODE) Project. To this end, chromatin immunoprecipitation followed by high-throughput sequencing (ChIP-seq) is the standard methodology. Mapping such protein-DNA interactions in vivo using ChIP-seq presents multiple challenges not only in sample preparation and sequencing but also for computational analysis. Here, we present step-by-step guidelines for the computational analysis of ChIP-seq data. We address all the major steps in the analysis of ChIP-seq data: sequencing depth selection, quality checking, mapping, data normalization, assessment of reproducibility, peak calling, differential binding analysis, controlling the false discovery rate, peak annotation, visualization, and motif analysis. At each step in our guidelines we discuss some of the software tools most frequently used. We also highlight the challenges and problems associated with each step in ChIP-seq data analysis. We present a concise workflow for the analysis of ChIP-seq data in Figure 1 that complements and expands on the recommendations of the ENCODE and modENCODE projects. Each step in the workflow is described in detail in the following sections.},
  author = {Bailey, Timothy and Krajewski, Pawel and Ladunga, Istvan and Lefebvre, Celine and Li, Qunhua and Liu, Tao and Madrigal, Pedro and Taslim, Cenny and Zhang, Jie},
  date = {2013-11},
  doi = {10/gfr9pq},
  eprint = {24244136},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Bailey et al. - 2013 - Practical guidelines for the comprehensive analysi.pdf},
  issn = {1553-7358},
  journaltitle = {PLoS Computational Biology},
  number = {11},
  pages = {e1003326},
  title = {Practical Guidelines for the Comprehensive Analysis of {{ChIP}}-Seq Data},
  volume = {9}
}

@article{Bair2004,
  abstract = {An important goal of DNA microarray research is to develop tools to diagnose cancer more accurately based on the genetic profile of a tumor. There are several existing techniques in the literature for performing this type of diagnosis. Unfortunately, most of these techniques assume that different subtypes of cancer are already known to exist. Their utility is limited when such subtypes have not been previously identified. Although methods for identifying such subtypes exist, these methods do not work well for all datasets. It would be desirable to develop a procedure to find such subtypes that is applicable in a wide variety of circumstances. Even if no information is known about possible subtypes of a certain form of cancer, clinical information about the patients, such as their survival time, is often available. In this study, we develop some procedures that utilize both the gene expression data and the clinical data to identify subtypes of cancer and use this knowledge to diagnose future patients. These procedures were successfully applied to several publicly available datasets. We present diagnostic procedures that accurately predict the survival of future patients based on the gene expression profile and survival times of previous patients. This has the potential to be a powerful tool for diagnosing and treating cancer.},
  author = {Bair, Eric and Tibshirani, Robert},
  date = {2004-04},
  doi = {10/c4qv69},
  eprint = {15094809},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Bair and Tibshirani - 2004 - Semi-supervised methods to predict patient surviva.pdf},
  issn = {1545-7885},
  journaltitle = {PLoS Biology},
  keywords = {Breast Neoplasms,Breast Neoplasms: metabolism,Breast Neoplasms: mortality,Cluster Analysis,Computer Simulation,Data Interpretation; Statistical,Databases; Factual,Gene Expression Profiling,Humans,Models; Statistical,Neoplasms,Neoplasms: metabolism,Neoplasms: mortality,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods,Principal Component Analysis,Prognosis,Software,Time Factors,Treatment Outcome},
  number = {4},
  pages = {E108},
  title = {Semi-Supervised Methods to Predict Patient Survival from Gene Expression Data},
  volume = {2}
}

@article{Barbie2009,
  abstract = {The proto-oncogene KRAS is mutated in a wide array of human cancers, most of which are aggressive and respond poorly to standard therapies. Although the identification of specific oncogenes has led to the development of clinically effective, molecularly targeted therapies in some cases, KRAS has remained refractory to this approach. A complementary strategy for targeting KRAS is to identify gene products that, when inhibited, result in cell death only in the presence of an oncogenic allele. Here we have used systematic RNA interference to detect synthetic lethal partners of oncogenic KRAS and found that the non-canonical IkappaB kinase TBK1 was selectively essential in cells that contain mutant KRAS. Suppression of TBK1 induced apoptosis specifically in human cancer cell lines that depend on oncogenic KRAS expression. In these cells, TBK1 activated NF-kappaB anti-apoptotic signals involving c-Rel and BCL-XL (also known as BCL2L1) that were essential for survival, providing mechanistic insights into this synthetic lethal interaction. These observations indicate that TBK1 and NF-kappaB signalling are essential in KRAS mutant tumours, and establish a general approach for the rational identification of co-dependent pathways in cancer.},
  author = {Barbie, David A and Tamayo, Pablo and Boehm, Jesse S and Kim, So Young and Moody, Susan E and Dunn, Ian F and Schinzel, Anna C and Sandy, Peter and Meylan, Etienne and Scholl, Claudia and Fr{\"o}hling, Stefan and Chan, Edmond M and Sos, Martin L and Michel, Kathrin and Mermel, Craig and Silver, Serena J and Weir, Barbara A and Reiling, Jan H and Sheng, Qing and Gupta, Piyush B and Wadlow, Raymond C and Le, Hanh and Hoersch, Sebastian and Wittner, Ben S and Ramaswamy, Sridhar and Livingston, David M and Sabatini, David M and Meyerson, Matthew and Thomas, Roman K and Lander, Eric S and Mesirov, Jill P and Root, David E and Gilliland, D Gary and Jacks, Tyler and Hahn, William C},
  date = {2009-11-05},
  doi = {10/frdz3h},
  eprint = {19847166},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Barbie et al. - 2009 - Systematic RNA interference reveals that oncogenic.pdf},
  issn = {1476-4687},
  journaltitle = {Nature},
  keywords = {Alleles,Apoptosis,bcl-X Protein,bcl-X Protein: metabolism,Cell Line; Tumor,Cell Survival,Gene Expression Profiling,Genes; Lethal,Genes; ras,Genes; ras: genetics,Humans,Lung Neoplasms,Lung Neoplasms: genetics,Lung Neoplasms: metabolism,Lung Neoplasms: pathology,Neoplasms,Neoplasms: genetics,Neoplasms: metabolism,Neoplasms: pathology,Oncogene Protein p21(ras),Oncogene Protein p21(ras): genetics,Oncogene Protein p21(ras): metabolism,Protein-Serine-Threonine Kinases,Protein-Serine-Threonine Kinases: antagonists & in,Protein-Serine-Threonine Kinases: metabolism,Proto-Oncogene Proteins c-rel,Proto-Oncogene Proteins c-rel: metabolism,RNA Interference,Signal Transduction},
  number = {7269},
  pages = {108-12},
  title = {Systematic {{RNA}} Interference Reveals That Oncogenic {{KRAS}}-Driven Cancers Require {{TBK1}}},
  volume = {462}
}

@article{Barski2007,
  abstract = {Histone modifications are implicated in influencing gene expression. We have generated high-resolution maps for the genome-wide distribution of 20 histone lysine and arginine methylations as well as histone variant H2A.Z, RNA polymerase II, and the insulator binding protein CTCF across the human genome using the Solexa 1G sequencing technology. Typical patterns of histone methylations exhibited at promoters, insulators, enhancers, and transcribed regions are identified. The monomethylations of H3K27, H3K9, H4K20, H3K79, and H2BK5 are all linked to gene activation, whereas trimethylations of H3K27, H3K9, and H3K79 are linked to repression. H2A.Z associates with functional regulatory elements, and CTCF marks boundaries of histone methylation domains. Chromosome banding patterns are correlated with unique patterns of histone modifications. Chromosome breakpoints detected in T cell cancers frequently reside in chromatin regions associated with H3K4 methylations. Our data provide new insights into the function of histone methylation and chromatin organization in genome function.},
  author = {Barski, Artem and Cuddapah, Suresh and Cui, Kairong and Roh, Tae-Young and Schones, Dustin E and Wang, Zhibin and Wei, Gang and Chepelev, Iouri and Zhao, Keji},
  date = {2007-05-18},
  doi = {10/dvv94h},
  eprint = {17512414},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Barski et al. - 2007 - High-resolution profiling of histone methylations .pdf},
  issn = {0092-8674},
  journaltitle = {Cell},
  keywords = {Chromatin,Chromatin: genetics,Chromatin: ultrastructure,Chromosome Breakage,Enhancer Elements; Genetic,Enhancer Elements; Genetic: genetics,Epigenesis; Genetic,Epigenesis; Genetic: genetics,Gene Expression Profiling,Gene Expression Profiling: methods,Gene Expression Regulation,Gene Expression Regulation: genetics,Genome; Human,Genome; Human: genetics,Histone-Lysine N-Methyltransferase,Histone-Lysine N-Methyltransferase: metabolism,Histones,Histones: genetics,Histones: metabolism,Humans,Lymphoma,Lymphoma: genetics,Methylation,Promoter Regions; Genetic,Promoter Regions; Genetic: genetics,Protein Methyltransferases,Regulatory Elements; Transcriptional,Regulatory Elements; Transcriptional: genetics,RNA Polymerase II,RNA Polymerase II: metabolism,Transcriptional Activation,Transcriptional Activation: genetics},
  number = {4},
  pages = {823-37},
  title = {High-Resolution Profiling of Histone Methylations in the Human Genome},
  volume = {129}
}

@article{Bartholome2009a,
  abstract = {In order to handle and interpret the vast amounts of data produced by microarray experiments, the analysis of sets of genes with a common biological functionality has been shown to be advantageous compared to single gene analyses. Some statistical methods have been proposed to analyse the differential gene expression of gene sets in microarray experiments. However, most of these methods either require threshhold values to be chosen for the analysis, or they need some reference set for the determination of significance. We present a method that estimates the number of differentially expressed genes in a gene set without requiring a threshold value for significance of genes. The method is self-contained (i.e., it does not require a reference set for comparison). In contrast to other methods which are focused on significance, our approach emphasizes the relevance of the regulation of gene sets. The presented method measures the degree of regulation of a gene set and is a useful tool to compare the induction of different gene sets and place the results of microarray experiments into the biological context. An R-package is available.},
  author = {Bartholom{\'e}, Kilian and Kreutz, Clemens and Timmer, Jens},
  date = {2009-07},
  doi = {10/ftj4zb},
  eprint = {19580524},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Bartholomé et al. - 2009 - Estimation of gene induction enables a relevance-b.pdf},
  issn = {1557-8666},
  journaltitle = {Journal of Computational Biology},
  keywords = {Gene Expression Profiling,Gene Expression Profiling: methods,Gene Expression Regulation,Models; Biological,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods},
  number = {7},
  pages = {959-67},
  title = {Estimation of Gene Induction Enables a Relevance-Based Ranking of Gene Sets},
  volume = {16}
}

@article{Bartholomew2009,
  abstract = {Mesenchymal stem cells directly suppress ongoing immune responses. Through production of toleragenic cytokines, inhibition of lymphocyte proliferation, delivery of reparative and protective signals after reperfusion injury, and facilitation of hematopoietic chimerism, these cells demonstrate a wide-ranging potential for the development of multifaceted toleragenic strategies after transplantation.},
  author = {Bartholomew, Amelia and Polchert, David and Szilagyi, Erzsebet and Douglas, G. W. and Kenyon, Norma},
  date = {2009-05},
  doi = {10/cf45q7},
  file = {/Users/ryan/Documents/Zotero Library/Bartholomew et al. - 2009 - Mesenchymal stem cells in the induction of transpl.pdf},
  issn = {1534-6080},
  journaltitle = {Transplantation},
  keywords = {mesenchymal stem cells,tolerance,transplantation},
  number = {9 Suppl},
  pages = {S55-S57},
  title = {Mesenchymal Stem Cells in the Induction of Transplantation Tolerance},
  volume = {87}
}

@misc{Bekiranov2009,
  author   = {Bekiranov, Stefan},
  title    = {Introduction to {{ChIP}}-{{Seq Analysis}} Using the {{SPP Package}}},
  date     = {2009-06-15},
  note     = {Pages: 1-8},
  langid   = {english},
  keywords = {presentation},
  file     = {/Users/ryan/Documents/Zotero Library/Bekiranov - 2009 - Introduction to ChIP-Seq Analysis using the SPP Pa.pdf}
}

@article{Beleites2013,
  abstract = {In biospectroscopy, suitably annotated and statistically independent samples (e.g. patients, batches, etc.) for classifier training and testing are scarce and costly. Learning curves show the model performance as function of the training sample size and can help to determine the sample size needed to train good classifiers. However, building a good model is actually not enough: the performance must also be proven. We discuss learning curves for typical small sample size situations with 5-25 independent samples per class. Although the classification models achieve acceptable performance, the learning curve can be completely masked by the random testing uncertainty due to the equally limited test sample size. In consequence, we determine test sample sizes necessary to achieve reasonable precision in the validation and find that 75-100 samples will usually be needed to test a good but not perfect classifier. Such a data set will then allow refined sample size planning on the basis of the achieved performance. We also demonstrate how to calculate necessary sample sizes in order to show the superiority of one classifier over another: this often requires hundreds of statistically independent test samples or is even theoretically impossible. We demonstrate our findings with a data set of ca. 2550 Raman spectra of single cells (five classes: erythrocytes, leukocytes and three tumour cell lines BT-20, MCF-7 and OCI-AML3) as well as by an extensive simulation that allows precise determination of the actual performance of the models in question.},
  author = {Beleites, Claudia and Neugebauer, Ute and Bocklitz, Thomas and Krafft, Christoph and Popp, J{\"u}rgen},
  date = {2013-01-14},
  doi = {10/gf8kzx},
  eprint = {23265730},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Beleites et al. - 2013 - Sample size planning for classification models..pdf},
  issn = {1873-4324},
  journaltitle = {Analytica Chimica Acta},
  keywords = {Cells; Cultured,Erythrocytes,Erythrocytes: chemistry,Erythrocytes: classification,Erythrocytes: cytology,Humans,Leukocytes,Leukocytes: chemistry,Leukocytes: classification,Leukocytes: cytology,MCF-7 Cells,Models; Theoretical,Sample Size,Spectrum Analysis; Raman},
  pages = {25-33},
  title = {Sample Size Planning for Classification Models},
  volume = {760}
}

@article{Benjamini1995,
  author = {Benjamini, Yoav and Hochberg, Yosef},
  date = {1995},
  eprint = {10.2307/2346101},
  eprinttype = {jstor},
  journaltitle = {Journal of the Royal Statistical Society. Series B (Methodological)},
  keywords = {\#nosource,⛔ No DOI found},
  number = {1},
  pages = {289-300},
  title = {Controlling the False Discovery Rate: A Practical and Powerful Approach to Multiple Testing},
  volume = {57}
}

@article{Benjamini1997,
  abstract = {ABSTRACT. In this paper we offer a multiplicity of approaches and procedures for multiple testing problems with weights. Some rationale for incorporating weights in multiple hypotheses testing are discussed. Various type-I error-rates and different possible ...},
  author = {Benjamini, Yoav and Hochberg, Yosef},
  date = {1997},
  doi = {10/btjgsv},
  file = {/Users/ryan/Documents/Zotero Library/Benjamini and Hochberg - 1997 - Multiple Hypotheses Testing with Weights.pdf},
  issn = {0303-6898, 1467-9469},
  journaltitle = {Scandinavian Journal of Statistics},
  keywords = {control weights,false discovery rate,family-wise error-rate,p-values,per-family error-rate,procedural weights},
  number = {3},
  pages = {407-418},
  title = {Multiple {{Hypotheses Testing}} with {{Weights}}},
  volume = {24}
}

@article{Benjamini2016,
  abstract = {Scientists use high-dimensional measurement assays to detect and prioritize regions of strong signal in a spatially organized domain. Examples include finding methylation enriched genomic regions using microarrays and identifying active cortical areas using brain-imaging. The most common procedure for detecting potential regions is to group together neighboring sites where the signal passed a threshold. However, one needs to account for the selection bias induced by this opportunistic procedure to avoid diminishing effects when generalizing to a population. In this paper, we present a model and a method that permit population inference for these detected regions. In particular, we provide non-asymptotic point and confidence interval estimates for mean effect in the region, which account for the local selection bias and the non-stationary covariance that is typical of these data. Such summaries allow researchers to better compare regions of different sizes and different correlation structures. Inference is provided within a conditional one-parameter exponential family for each region, with truncations that match the constraints of selection. A secondary screening-and-adjustment step allows pruning the set of detected regions, while controlling the false-coverage rate for the set of regions that are reported. We illustrate the benefits of the method by applying it to detected genomic regions with differing DNA-methylation rates across tissue types. Our method is shown to provide superior power compared to non-parametric approaches.},
  author = {Benjamini, Yuval and Taylor, Jonathan and Irizarry, Rafael A},
  date = {2019-07-03},
  doi = {10/ggcxjw},
  issn = {0162-1459},
  journaltitle = {Journal of the American Statistical Association},
  keywords = {\#nosource},
  number = {527},
  pages = {1351-1365},
  title = {Selection-{{Corrected Statistical Inference}} for {{Region Detection With High}}-{{Throughput Assays}}},
  volume = {114}
}

@article{Berard2002,
  abstract = {Mature T cells are produced in the thymus and released into the bloodstream in low numbers. These cells are considered to be immunologically na{\"\i}ve until such time as they encounter MHC-peptide complexes for which their T-cell receptors (TCR) have high affinity. Recognition of antigen in appropriate form, i.e. in association with costimulatory signals on the surface of professional antigen-presenting cells (APCs), leads to extensive T-cell proliferation and differentiation into effector cells. Once the infection has been cleared, it is no longer of benefit to the host to maintain high numbers of effector cells and most of the activated T cells die by apoptosis. However, a proportion of these cells survive, leaving the frequency of cells specific for the priming antigen much higher among memory T cells than that which existed among na{\"\i}ve T cells. This difference in frequency makes a major contribution to the nature of the secondary response, which is typically faster and of greater magnitude than the primary response. In addition, T cells may also carry a true `memory' of a prior response to antigen, exhibiting differences from na{\"\i}ve T cells at the single cell level. Here we provide a brief overview of the qualitative differences that have been reported to exist between na{\"\i}ve and memory T cells and evidence that memory T cells themselves are functionally heterogeneous. PHENOTYPIC DIFFERENCES BETWEEN NA{\"I}VE AND MEMORY T CELLS The supposition that na{\"\i}ve and memory T cells can be distinguished phenotypically is based on the notion that memory T cells retain a permanent imprint of having responded to antigen. Precise identification of memory T cells, however, remains problematic. Unlike B cells, T cells do not appear to mutate their antigen receptor genes during the course of an immune response. 
Furthermore, discrimination between effector and memory T cells is accomplished Received 2 April 2002; accepted 17 April 2002. Correspondence: David F. Tough, The Edward Jenner Institute for Vaccine Research, Compton, Newbury, Berkshire RG20 7NN, UK. E-mail: david.tough@jenner.ac.uk on},
  author = {Berard, Marion and Tough, David F.},
  date = {2002},
  doi = {10/fc8shc},
  file = {/Users/ryan/Documents/Zotero Library/Berard and Tough - 2002 - Qualitative differences between naïve and memory T.pdf},
  issn = {0019-2805},
  journaltitle = {Immunology},
  number = {2},
  pages = {127-138},
  title = {Qualitative Differences between Na{\"\i}ve and Memory {{T}} Cells},
  volume = {106}
}

@article{Berest2018,
  abstract = {Transcription factor (TF) activity constitutes an important readout of cellular signalling pathways and thus for assessing regulatory differences across conditions. However, current technologies lack the ability to simultaneously assess activity changes for multiple TFs and in particular to determine whether a specific TF acts as repressor or activator. To this end, we introduce a widely applicable genome-wide method diffTF to assess differential TF binding activity and classifying TFs as activator or repressor by integrating any type of genome-wide chromatin with RNA-Seq data and in-silico predicted TF binding sites (available at https://git.embl.de/grp-zaugg/diffTF). We apply diffTF to a large ATAC-Seq dataset of mutated and unmutated chronic lymphocytic leukemia and identify dozens of TFs that are differentially active. Around 40\% of them have a previously described association with CLL while \textasciitilde{}60\% constitute potentially novel TFs driving the different CLL subtypes. Finally, we validated the method experimentally using the well studied system of hematopoietic differentiation in mouse.},
  author = {Berest, Ivan and Arnold, Christian and Reyes-Palomares, Armando and Palla, Giovanni and Rasmussen, Kasper Dindler and Helin, Kristian and Zaugg, Judith B.},
  date = {2018},
  doi = {10/ggcxjx},
  file = {/Users/ryan/Documents/Zotero Library/Berest et al. - 2018 - Quantification of differential transcription facto.pdf},
  journaltitle = {bioRxiv},
  pages = {368498},
  title = {Quantification of Differential Transcription Factor Activity and Multiomic-Based Classification into Activators and Repressors: {{diffTF}}}
}

@article{Berge2017,
  author = {{Van den Berge}, Koen and Soneson, Charlotte and Robinson, Mark D and Clement, Lieven},
  date = {2017},
  file = {/Users/ryan/Documents/Zotero Library/Berge et al. - 2017 - A general and powerful stage-wise testing procedur.pdf},
  keywords = {differential expression,differential transcript usage,rna-sequencing,stage-wise testing},
  pages = {1-14},
  title = {A General and Powerful Stage-Wise Testing Procedure for Differential Expression and Differential Transcript Usage}
}

@article{Berglund2017,
  abstract = {Background: Autologous and allogeneic adult mesenchymal stem/stromal cells (MSCs) are increasingly being investigated for treating a wide range of clinical diseases. Allogeneic MSCs are especially attractive due to their potential to provide immediate care at the time of tissue injury or disease diagnosis. The prevailing dogma has been that allogeneic MSCs are immune privileged, but there have been very few studies that control for matched or mismatched major histocompatibility complex (MHC) molecule expression and that examine immunogenicity in vivo. Studies that control for MHC expression have reported both cell-mediated and humoral immune responses to MHC-mismatched MSCs. The clinical implications of immune responses to MHC-mismatched MSCs are still unknown. Pre-clinical and clinical studies that document the MHC haplotype of donors and recipients and measure immune responses following MSC treatment are necessary to answer this critical question. Conclusions: This review details what is currently known about the immunogenicity of allogeneic MSCs and suggests contemporary assays that could be utilized in future studies to appropriately identify and measure immune responses to MHC-mismatched MSCs.},
  author = {Berglund, Alix K. and Fortier, Lisa A. and Antczak, Douglas F. and Schnabel, Lauren V.},
  date = {2017-12-22},
  doi = {10/ggcxjz},
  file = {/Users/ryan/Documents/Zotero Library/Berglund et al. - 2017 - Immunoprivileged no more Measuring the immunogeni.pdf},
  issn = {1757-6512},
  journaltitle = {Stem Cell Research and Therapy},
  keywords = {Allogeneic,Cytotoxicity,ELISPOT,Immunogenicity,Major histocompatibility complex,Mesenchymal stem cell,Microcytotoxicity,Mixed leukocyte reaction},
  number = {1},
  pages = {288},
  title = {Immunoprivileged No More: {{Measuring}} the Immunogenicity of Allogeneic Adult Mesenchymal Stem Cells},
  volume = {8}
}

@article{Berman2010,
  abstract = {OBJECTIVE - To test the graft-promoting effects of mesenchymal stem cells (MSCs) in a cynomolgus monkey model of islet/bone marrow transplantation. RESEARCH DESIGN AND METHODS - Cynomolgus MSCs were obtained from iliac crest aspirate and characterized through passage 11 for phenotype, gene expression, differentiation potential, and karyotype. Allogeneic donor MSCs were cotransplanted intraportally with islets on postoperative day (POD) 0 and intravenously with donor marrow on PODs 5 and 11. Recipients were followed for stabilization of blood glucose levels, reduction of exogenous insulin requirement (EIR), C-peptide levels, changes in peripheral blood T regulatory cells, and chimerism. Destabilization of glycemia and increases in EIR were used as signs of rejection; additional intravenous MSCs were administered to test the effect on reversal of rejection. RESULTS - MSC phenotype and a normal karyotype were observed through passage 11. IL-6, IL-10, vascular endothelial growth factor, TGF-{$\beta$}, hepatocyte growth factor, and galectin-1 gene expression levels varied among donors. MSC treatment significantly enhanced islet engraftment and function at 1 month posttransplant (n = 8), as compared with animals that received islets without MSCs (n = 3). Additional infusions of donor or third-party MSCs resulted in reversal of rejection episodes and prolongation of islet function in two animals. Stable islet allograft function was associated with increased numbers of regulatory T-cells in peripheral blood. CONCLUSIONS - MSCs may provide an important approach for enhancement of islet engraftment, thereby decreasing the numbers of islets needed to achieve insulin independence. Furthermore, MSCs may serve as a new, safe, and effective antirejection therapy. \textcopyright{} 2010 by the American Diabetes Association.},
  author = {Berman, Dora M. and Willman, Melissa A. and Han, Dongmei and Kleiner, Gary and Kenyon, Norman M. and Cabrera, Over and Karl, Julie A. and Wiseman, Roger W. and O'Connor, David H. and Bartholomew, Amelia M. and Kenyon, Norma S.},
  date = {2010},
  doi = {10/c9r6nn},
  file = {/Users/ryan/Documents/Zotero Library/Berman et al. - 2010 - Mesenchymal stem cells enhance allogeneic islet en.pdf},
  issn = {0012-1797},
  journaltitle = {Diabetes},
  number = {10},
  pages = {2558--2568},
  title = {Mesenchymal Stem Cells Enhance Allogeneic Islet Engraftment in Nonhuman Primates},
  volume = {59}
}

@article{Bi2013,
  abstract = {BACKGROUND: RNA-seq, a massive parallel-sequencing-based transcriptome profiling method, provides digital data in the form of aligned sequence read counts. The comparative analyses of the data require appropriate statistical methods to estimate the differential expression of transcript variants across different cell/tissue types and disease conditions.

RESULTS: We developed a novel nonparametric empirical Bayesian-based approach (NPEBseq) to model the RNA-seq data. The prior distribution of the Bayesian model is empirically estimated from the data without any parametric assumption, and hence the method is "nonparametric" in nature. Based on this model, we proposed a method for detecting differentially expressed genes across different conditions. We also extended this method to detect differential usage of exons from RNA-seq data. The evaluation of NPEBseq on both simulated and publicly available RNA-seq datasets and comparison with three popular methods showed improved results for experiments with or without biological replicates.

CONCLUSIONS: NPEBseq can successfully detect differential expression between different conditions not only at gene level but also at exon level from RNA-seq datasets. In addition, NPEBSeq performs significantly better than current methods and can be applied to genome-wide RNA-seq datasets. Sample datasets and R package are available at http://bioinformatics.wistar.upenn.edu/NPEBseq.},
  author = {Bi, Yingtao and Davuluri, Ramana V},
  date = {2013-08-27},
  doi = {10/gb8vvz},
  eprint = {23981227},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Bi and Davuluri - 2013 - NPEBseq nonparametric empirical bayesian-based pr.pdf},
  issn = {1471-2105},
  journaltitle = {BMC bioinformatics},
  number = {1},
  pages = {262},
  title = {{{NPEBseq}}: Nonparametric Empirical Bayesian-Based Procedure for Differential Expression Analysis of {{RNA}}-Seq Data},
  volume = {14}
}

@report{Bischl2012,
  abstract = {Empirical analysis of statistical algorithms often demands time-consuming experiments which are best performed on high performance computing clusters. We present two R packages which greatly simplify working in batch computing environments. The package BatchJobs implements the basic objects and procedures to control a batch cluster within R. It is structured around cluster versions of the well-known higher order functions Map, Reduce and Filter from functional programming. An important feature is that the state of computation is persistently available in a database. The user can query the status of jobs and then continue working with a desired subset. The second package, BatchExperiments, is tailored for the still very general scenario of analyzing arbitrary algorithms on problem instances. It extends BatchJobs by letting the user define an array of jobs of the kind ``apply algorithm A to problem instance P and store results''. It is possible to associate statistical designs with parameters of algorithms and problems and therefore to systematically study their influence on the results. In general our main contributions are: (a) Portability: Both packages use a clear and well-defined interface to the batch system which makes them applicable in most high-performance computing environments. (b) Reproducibility: Every computational part has an associated seed that the user can control to ensure reproducibility even when the underlying batch system changes. (c) Efficiency: Efficiently use batch computing clusters completely within R. (d) Abstraction and good software design: The code layers for algorithms, experiment definitions and execution are cleanly separated and enable the writing of readable and maintainable code.},
  author = {Bischl, Bernd and Lang, Michel},
  date = {2012},
  file = {/Users/ryan/Documents/Zotero Library/Bischl and Lang - 2012 - Computing on high performance clusters with R Pac.pdf},
  institution = {{technische universit\"at dortmund}},
  title = {Computing on High Performance Clusters with {{R}}: {{Packages BatchJobs}} and {{BatchExperiments}}},
  type = {techreport}
}

@article{blanchetteAligningMultipleGenomic2004,
  abstract = {We define a ``threaded blockset,'' which is a novel generalization of the classic notion of a multiple alignment. A new computer program called TBA (for ``threaded blockset aligner'') builds a threaded blockset under the assumption that all matching segments occur in the same order and orientation in the given sequences; inversions and duplications are not addressed. TBA is designed to be appropriate for aligning many, but by no means all, megabase-sized regions of multiple mammalian genomes. The output of TBA can be projected onto any genome chosen as a reference, thus guaranteeing that different projections present consistent predictions of which genomic positions are orthologous. This capability is illustrated using a new visualization tool to view TBA-generated alignments of vertebrate Hox clusters from both the mammalian and fish perspectives. Experimental evaluation of alignment quality, using a program that simulates evolutionary change in genomic sequences, indicates that TBA is more accurate than earlier programs. To perform the dynamic-programming alignment step, TBA runs a stand-alone program called MULTIZ, which can be used to align highly rearranged or incompletely sequenced genomes. We describe our use of MULTIZ to produce the whole-genome multiple alignments at the Santa Cruz Genome Browser.},
  author = {Blanchette, Mathieu and Kent, W. James and Riemer, Cathy and Elnitski, Laura and Smit, Arian F. A. and Roskin, Krishna M. and Baertsch, Robert and Rosenbloom, Kate and Clawson, Hiram and Green, Eric D. and Haussler, David and Miller, Webb},
  date = {2004-01-04},
  doi = {10/d79h2w},
  eprint = {15060014},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Blanchette et al. - 2004 - Aligning Multiple Genomic Sequences With the Threa.pdf;/Users/ryan/Zotero/storage/B94BUFKK/708.html},
  issn = {1088-9051, 1549-5469},
  journaltitle = {Genome Research},
  langid = {english},
  number = {4},
  pages = {708--715},
  shortjournal = {Genome Res.},
  title = {Aligning {{Multiple Genomic Sequences With}} the {{Threaded Blockset Aligner}}},
  volume = {14}
}

@article{Blanco2007,
  abstract = {This unit describes the usage of geneid, an efficient gene-finding program that allows for the analysis of large genomic sequences, including whole mammalian chromosomes. These sequences can be partially annotated, and geneid can be used to refine this initial annotation. Training geneid is relatively easy, and parameter configurations exist for a number of eukaryotic species. Geneid produces output in a variety of standard formats. The results, thus, can be processed by a variety of software tools, including visualization programs. Geneid software is in the public domain, and it is undergoing constant development. It is easy to install and use. Exhaustive benchmark evaluations show that geneid compares favorably with other existing gene finding tools.},
  author = {Blanco, Enrique and Parra, Gen\'is and Guig\'o, Roderic},
  date = {2007-06},
  doi = {10/b54cc7},
  eprint = {18428791},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Blanco et al. - 2007 - Using geneid to identify genes..pdf},
  issn = {1934-340X},
  journaltitle = {Current Protocols in Bioinformatics},
  keywords = {Algorithms,Base Sequence,Chromosome Mapping,Chromosome Mapping: methods,DNA,DNA: methods,Genes,Genes: genetics,Molecular Sequence Data,Sequence Alignment,Sequence Alignment: methods,Sequence Analysis},
  number = {1},
  pages = {Unit 4.3},
  title = {Using Geneid to Identify Genes},
  volume = {Chapter 4}
}

@article{Blume2018,
  abstract = {Verifying that a statistically significant result is scientifically meaningful is not only good scientific practice, it is a natural way to control the Type I error rate. Here we introduce a novel extension of the p-value\textemdash{}a second-generation p-value (p{$\delta$})\textendash{}that formally accounts for scientific relevance and leverages this natural Type I Error control. The approach relies on a pre-specified interval null hypothesis that represents the collection of effect sizes that are scientifically uninteresting or are practically null. The second-generation p-value is the proportion of data-supported hypotheses that are also null hypotheses. As such, second-generation p-values indicate when the data are compatible with null hypotheses (p{$\delta$} = 1), or with alternative hypotheses (p{$\delta$} = 0), or when the data are inconclusive (0 {$<$} p{$\delta$} {$<$} 1). Moreover, second-generation p-values provide a proper scientific adjustment for multiple comparisons and reduce false discovery rates. This is an advance for environments rich in data, where traditional p-value adjustments are needlessly punitive. Second-generation p-values promote transparency, rigor and reproducibility of scientific results by a priori specifying which candidate hypotheses are practically meaningful and by providing a more reliable statistical summary of when the data are compatible with alternative or null hypotheses.},
  author = {Blume, Jeffrey D. and D'Agostino McGowan, Lucy and Dupont, William D. and Greevy, Robert A.},
  date = {2018-03-22},
  doi = {10/gc7575},
  editor = {Smalheiser, Neil R.},
  eprint = {1709.09333},
  eprinttype = {arxiv},
  file = {/Users/ryan/Documents/Zotero Library/Blume et al. - 2018 - Second-generation p-values Improved rigor, reprod.pdf},
  issn = {1932-6203},
  journaltitle = {PLoS ONE},
  number = {3},
  pages = {e0188299},
  title = {Second-Generation p-Values: {{Improved}} Rigor, Reproducibility, \& Transparency in Statistical Analyses},
  volume = {13}
}

@article{Boley2014a,
  abstract = {The identification of full length transcripts entirely from short-read RNA sequencing data (RNA-seq) remains a challenge in the annotation of genomes. Here we describe an automated pipeline for genome annotation that integrates RNA-seq and gene-boundary data sets, which we call Generalized RNA Integration Tool, or GRIT. Applying GRIT to Drosophila melanogaster short-read RNA-seq, cap analysis of gene expression (CAGE) and poly(A)-site-seq data collected for the modENCODE project, we recovered the vast majority of previously annotated transcripts and doubled the total number of transcripts cataloged. We found that 20\% of protein coding genes encode multiple protein-localization signals and that, in 20-d-old adult fly heads, genes with multiple polyadenylation sites are more common than genes with alternative splicing or alternative promoters. GRIT demonstrates 30\% higher precision and recall than the most widely used transcript assembly tools. GRIT will facilitate the automated generation of high-quality genome annotations without the need for extensive manual annotation.},
  author = {Boley, Nathan and Stoiber, Marcus H and Booth, Benjamin W and Wan, Kenneth H and Hoskins, Roger A and Bickel, Peter J and Celniker, Susan E and Brown, James B},
  date = {2014-03-16},
  doi = {10/f5zgdb},
  eprint = {24633242},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Boley et al. - 2014 - Genome-guided transcript assembly by integrative a.pdf},
  issn = {1546-1696},
  journaltitle = {Nature Biotechnology},
  title = {Genome-Guided Transcript Assembly by Integrative Analysis of {{RNA}} Sequence Data}
}

@software{boleyIrreproducibleDiscoveryRate2019,
  author   = {Boley, Nathan},
  date     = {2019-10-15T20:58:43Z},
  ids      = {gh-idr},
  keywords = {⛔ No DOI found},
  origdate = {2015-01-22T18:57:07Z},
  title    = {Irreproducible Discovery Rate ({{IDR}})},
  url      = {https://github.com/nboley/idr},
  urldate  = {2019-11-14},
}

@article{Bolstad2003,
  abstract = {MOTIVATION: When running experiments that involve multiple high density oligonucleotide arrays, it is important to remove sources of variation between arrays of non-biological origin. Normalization is a process for reducing this variation. It is common to see non-linear relations between arrays and the standard normalization provided by Affymetrix does not perform well in these situations.

RESULTS: We present three methods of performing normalization at the probe intensity level. These methods are called complete data methods because they make use of data from all arrays in an experiment to form the normalizing relation. These algorithms are compared to two methods that make use of a baseline array: a one number scaling based algorithm and a method that uses a non-linear normalizing relation by comparing the variability and bias of an expression measure. Two publicly available datasets are used to carry out the comparisons. The simplest and quickest complete data method is found to perform favorably.

AVAILABILITY: Software implementing all three of the complete data normalization methods is available as part of the R package Affy, which is a part of the Bioconductor project http://www.bioconductor.org.

SUPPLEMENTARY INFORMATION: Additional figures may be found at http://www.stat.berkeley.edu/\textasciitilde{}bolstad/normalize/index.html},
  author = {Bolstad, B M and Irizarry, R A and Astrand, M and Speed, T P},
  date = {2003-01-22},
  eprint = {12538238},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Bolstad et al. - 2003 - A comparison of normalization methods for high den.pdf},
  issn = {1367-4803},
  journaltitle = {Bioinformatics (Oxford, England)},
  keywords = {Algorithms,Calibration,Models; Genetic,Molecular Probes,Nonlinear Dynamics,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: instrumentation,Oligonucleotide Array Sequence Analysis: methods,Oligonucleotide Array Sequence Analysis: standards,Quality Control,Sequence Analysis; DNA,Sequence Analysis; DNA: methods,Sequence Analysis; DNA: standards,Stochastic Processes},
  number = {2},
  pages = {185--193},
  title = {A Comparison of Normalization Methods for High Density Oligonucleotide Array Data Based on Variance and Bias},
  volume = {19}
}

@article{Bonafede2014,
  author = {Bonafede, Elisabetta and Picard, Franck and Viroli, Cinzia},
  date = {2014},
  doi = {10/f846sd},
  eprint = {1410.8093v2},
  eprinttype = {arxiv},
  file = {/Users/ryan/Documents/Zotero Library/Bonafede et al. - 2014 - Modelling overdispersion heterogeneity in differen.pdf},
  issn = {0006-341X},
  keywords = {hypothesis testing,mixture models,rna-seq data},
  pages = {1--22},
  title = {Modelling Overdispersion Heterogeneity in Differential Expression Analysis Using Mixtures}
}

@article{Boos2011,
  abstract = {Cyclin-dependent kinases (CDKs) play crucial roles in promoting DNA replication and preventing rereplication in eukaryotic cells [1-4]. In budding yeast, CDKs promote DNA replication by phosphorylating two proteins, Sld2 and Sld3, which generates binding sites for pairs of BRCT repeats (breast cancer gene 1 [BRCA1] C terminal repeats) in the Dpb11 protein [5, 6]. The Sld3-Dpb11-Sld2 complex generated by CDK phosphorylation is required for the assembly and activation of the Cdc45-Mcm2-7-GINS (CMG) replicative helicase. In response to DNA replication stress, the interaction between Sld3 and Dpb11 is blocked by the checkpoint kinase Rad53 [7], which prevents late origin firing [7, 8]. Here we show that the two key CDK sites in Sld3 are conserved in the human Sld3-related protein Treslin/ticrr and are essential for DNA replication. Moreover, phosphorylation of these two sites mediates interaction with the orthologous pair of BRCT repeats in the human Dpb11 ortholog, TopBP1. Finally, we show that DNA replication stress prevents the interaction between Treslin/ticrr and TopBP1 via the Chk1 checkpoint kinase. Our results indicate that Treslin/ticrr is a genuine ortholog of Sld3 and that the Sld3-Dpb11 interaction has remained a critical nexus of S phase regulation through eukaryotic evolution.},
  author = {Boos, Dominik and Sanchez-Pulido, Luis and Rappas, Mathieu and Pearl, Laurence H and Oliver, Antony W and Ponting, Chris P and Diffley, John F X},
  date = {2011-07-12},
  doi = {10/d754zj},
  eprint = {21700459},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Boos et al. - 2011 - Regulation of DNA replication through Sld3-Dpb11 i.pdf},
  issn = {1879-0445},
  journaltitle = {Current biology : CB},
  keywords = {Amino Acid Sequence,Cell Cycle Proteins,Cell Cycle Proteins: chemistry,Cell Cycle Proteins: metabolism,Cell Cycle Proteins: physiology,Conserved Sequence,Cyclin-Dependent Kinases,Cyclin-Dependent Kinases: chemistry,Cyclin-Dependent Kinases: physiology,DNA Replication,DNA Replication: physiology,Evolution; Molecular,Fungal Proteins,Fungal Proteins: chemistry,Fungal Proteins: metabolism,Fungal Proteins: physiology,HeLa Cells,Humans,Molecular Sequence Data,Protein Kinases,Protein Kinases: metabolism,Protein Kinases: physiology,Saccharomyces cerevisiae Proteins,Saccharomyces cerevisiae Proteins: chemistry,Saccharomyces cerevisiae Proteins: metabolism,Saccharomyces cerevisiae Proteins: physiology,Sequence Alignment,Yeasts,Yeasts: genetics},
  number = {13},
  pages = {1152--1157},
  title = {Regulation of {{DNA}} Replication through {{Sld3}}-{{Dpb11}} Interaction Is Conserved from Yeast to Humans},
  volume = {21}
}

@article{Bourgon2010,
  abstract = {With high-dimensional data, variable-by-variable statistical testing is often used to select variables whose behavior differs across conditions. Such an approach requires adjustment for multiple testing, which can result in low statistical power. A two-stage approach that first filters variables by a criterion independent of the test statistic, and then only tests variables which pass the filter, can provide higher power. We show that use of some filter/test statistics pairs presented in the literature may, however, lead to loss of type I error control. We describe other pairs which avoid this problem. In an application to microarray data, we found that gene-by-gene filtering by overall variance followed by a t-test increased the number of discoveries by 50\%. We also show that this particular statistic pair induces a lower bound on fold-change among the set of discoveries. Independent filtering-using filter/test pairs that are independent under the null hypothesis but correlated under the alternative-is a general approach that can substantially increase the efficiency of experiments.},
  author = {Bourgon, Richard and Gentleman, Robert and Huber, Wolfgang},
  date = {2010-05-25},
  doi = {10/b94qj2},
  eprint = {20460310},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Bourgon et al. - 2010 - Independent filtering increases detection power fo.pdf},
  issn = {1091-6490},
  journaltitle = {Proceedings of the National Academy of Sciences of the United States of America},
  keywords = {Algorithms,Biometry,Biometry: methods,Computational Biology,Models; Genetic},
  number = {21},
  pages = {9546--9551},
  title = {Independent Filtering Increases Detection Power for High-Throughput Experiments},
  volume = {107}
}

@article{Boyle2008,
  abstract = {Mapping DNase I hypersensitive (HS) sites is an accurate method of identifying the location of genetic regulatory elements, including promoters, enhancers, silencers, insulators, and locus control regions. We employed high-throughput sequencing and whole-genome tiled array strategies to identify DNase I HS sites within human primary CD4+ T cells. Combining these two technologies, we have created a comprehensive and accurate genome-wide open chromatin map. Surprisingly, only 16\%-21\% of the identified 94,925 DNase I HS sites are found in promoters or first exons of known genes, but nearly half of the most open sites are in these regions. In conjunction with expression, motif, and chromatin immunoprecipitation data, we find evidence of cell-type-specific characteristics, including the ability to identify transcription start sites and locations of different chromatin marks utilized in these cells. In addition, and unexpectedly, our analyses have uncovered detailed features of nucleosome structure.},
  author = {Boyle, Alan P and Davis, Sean and Shulha, Hennady P and Meltzer, Paul and Margulies, Elliott H and Weng, Zhiping and Furey, Terrence S and Crawford, Gregory E},
  date = {2008-01-25},
  doi = {10/fbcrk6},
  eprint = {18243105},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Boyle et al. - 2008 - High-resolution mapping and characterization of op.pdf},
  issn = {1097-4172},
  journaltitle = {Cell},
  keywords = {Algorithms,Area Under Curve,Binding Sites,CD4-Positive T-Lymphocytes,CD4-Positive T-Lymphocytes: cytology,Cell Nucleus,Cell Nucleus: metabolism,Chromatin,Chromatin Immunoprecipitation,Chromatin: genetics,Chromosome Mapping,Chromosome Mapping: methods,Chromosomes; Human,Deoxyribonuclease I,Deoxyribonuclease I: chemistry,Deoxyribonuclease I: pharmacology,Genome; Human,Genome; Human: genetics,Genome; Human: immunology,Histones,Histones: chemistry,Humans,Nucleosomes,Nucleosomes: chemistry,Oligonucleotide Array Sequence Analysis,Promoter Regions; Genetic,ROC Curve,Sensitivity and Specificity,Sequence Analysis; DNA,Transcription Factors,Transcription Factors: metabolism},
  number = {2},
  pages = {311--322},
  title = {High-Resolution Mapping and Characterization of Open Chromatin across the Genome},
  volume = {132}
}

@article{Bray2016,
  author = {Bray, Nicolas L and Pimentel, Harold and Melsted, P\'all and Pachter, Lior},
  date = {2016},
  doi = {10/f8nvsp},
  eprint = {27043002},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Bray et al. - 2016 - Near-optimal probabilistic RNA-seq quantification.pdf},
  issn = {1087-0156, 1546-1696},
  journaltitle = {Nature Biotechnology},
  number = {5},
  pages = {525--527},
  title = {Near-Optimal Probabilistic {{RNA}}-Seq Quantification},
  volume = {34}
}

@article{Breese2013,
  abstract = {SUMMARY: NGSUtils is a suite of software tools for manipulating data common to next-generation sequencing experiments, such as FASTQ, BED and BAM format files. These tools provide a stable and modular platform for data management and analysis. Availability and implementation: NGSUtils is available under a BSD license and works on Mac OS X and Linux systems. Python 2.6+ and virtualenv are required. More information and source code may be obtained from the website: http://ngsutils.org. CONTACT: yunliu@iupui.edu Supplemental information: Supplementary data are available at Bioinformatics online.},
  author = {Breese, Marcus R and Liu, Yunlong},
  date = {2013-01-21},
  doi = {10/ggcxj2},
  eprint = {23314324},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Breese and Liu - 2013 - NGSUtils a software suite for analyzing and manip.pdf},
  issn = {1367-4811},
  journaltitle = {Bioinformatics (Oxford, England)},
  number = {4},
  pages = {494--496},
  title = {{{NGSUtils}}: A Software Suite for Analyzing and Manipulating next-Generation Sequencing Datasets},
  volume = {29}
}

@article{Bresler2012,
  author = {Bresler, M. and Sheehan, S. and Chan, A. H. and Song, Y. S.},
  date = {2012-09-07},
  doi = {10/f38qfp},
  file = {/Users/ryan/Documents/Zotero Library/Bresler et al. - 2012 - Telescoper de novo assembly of highly repetitive .pdf},
  issn = {1367-4803},
  journaltitle = {Bioinformatics},
  number = {18},
  pages = {i311--i317},
  title = {Telescoper: De Novo Assembly of Highly Repetitive Regions},
  volume = {28}
}

@article{Caplan2017,
  abstract = {Mesenchymal stem cells (MSCs) were officially named more than 25 years ago to represent a class of cells from human and mammalian bone marrow and periosteum that could be isolated and expanded in culture while maintaining their in vitro capacity to be induced to form a variety of mesodermal phenotypes and tissues. The in vitro capacity to form bone, cartilage, fat, etc., became an assay for identifying this class of multipotent cells and around which several companies were formed in the 1990s to medically exploit the regenerative capabilities of MSCs. Today, there are hundreds of clinics and hundreds of clinical trials using human MSCs with very few, if any, focusing on the in vitro multipotential capacities of these cells. Unfortunately, the fact that MSCs are called ``stem cells'' is being used to infer that patients will receive direct medical benefit, because they imagine that these cells will differentiate into regenerating tissue-producing cells. Such a stem cell treatment will presumably cure the patient of their medically relevant difficulties ranging from osteoarthritic (bone-on-bone) knees to various neurological maladies including dementia. I now urge that we change the name of MSCs to Medicinal Signaling Cells to more accurately reflect the fact that these cells home in on sites of injury or disease and secrete bioactive factors that are immunomodulatory and trophic (regenerative) meaning that these cells make therapeutic drugs in situ that are medicinal. It is, indeed, the patient's own site-specific and tissue-specific resident stem cells that construct the new tissue as stimulated by the bioactive factors secreted by the exogenously supplied MSCs.},
  author = {Caplan, Arnold I.},
  date = {2017-06},
  doi = {10/ggcxj3},
  file = {/Users/ryan/Documents/Zotero Library/Caplan - 2017 - Mesenchymal stem cells Time to change the name!.pdf},
  issn = {2157-6580},
  journaltitle = {Stem Cells Translational Medicine},
  keywords = {Medicinal signaling cells,Mesenchymal stem cells,MSCs,Regenerative medicine},
  number = {6},
  pages = {1445--1451},
  title = {Mesenchymal Stem Cells: {{Time}} to Change the Name!},
  volume = {6}
}

@book{Carlson2013,
  author   = {Carlson, Marc and Obenchain, Valerie and Pag\`es, Herv\'e and Shannon, Paul and Tenenbaum, Dan and Morgan, Martin},
  date     = {2013-05-28},
  file     = {/Users/ryan/Documents/Zotero Library/Carlson et al. - 2013 - Intermediate R  Bioconductor for Sequence Analysi.pdf},
  keywords = {⛔ No DOI found},
  title    = {Intermediate {{R}} / {{Bioconductor}} for {{Sequence Analysis}}},
}

@article{Castellana2008,
  abstract = {Gene annotation underpins genome science. Most often protein coding sequence is inferred from the genome based on transcript evidence and computational predictions. While generally correct, gene models suffer from errors in reading frame, exon border definition, and exon identification. To ascertain the error rate of Arabidopsis thaliana gene models, we isolated proteins from a sample of Arabidopsis tissues and determined the amino acid sequences of 144,079 distinct peptides by tandem mass spectrometry. The peptides corresponded to 1 or more of 3 different translations of the genome: a 6-frame translation, an exon splice-graph, and the currently annotated proteome. The majority of the peptides (126,055) resided in existing gene models (12,769 confirmed proteins), comprising 40\% of annotated genes. Surprisingly, 18,024 novel peptides were found that do not correspond to annotated genes. Using the gene finding program AUGUSTUS and 5,426 novel peptides that occurred in clusters, we discovered 778 new protein-coding genes and refined the annotation of an additional 695 gene models. The remaining 13,449 novel peptides provide high quality annotation ({$>$}99\% correct) for thousands of additional genes. Our observation that 18,024 of 144,079 peptides did not match current gene models suggests that 13\% of the Arabidopsis proteome was incomplete due to approximately equal numbers of missing and incorrect gene models.},
  author = {Castellana, Natalie E and Payne, Samuel H and Shen, Zhouxin and Stanke, Mario and Bafna, Vineet and Briggs, Steven P},
  date = {2008-12-30},
  doi = {10/fpqs6c},
  eprint = {19098097},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Castellana et al. - 2008 - Discovery and revision of Arabidopsis genes by pro.pdf},
  issn = {1091-6490},
  journaltitle = {Proceedings of the National Academy of Sciences of the United States of America},
  keywords = {Arabidopsis,Arabidopsis Proteins,Arabidopsis Proteins: genetics,Arabidopsis: genetics,Genome; Plant,Genome; Plant: genetics,Models; Genetic,Proteome,Proteome: genetics,Proteomics,Proteomics: methods,Software},
  number = {52},
  pages = {21034-21038},
  title = {Discovery and Revision of {{Arabidopsis}} Genes by Proteogenomics},
  volume = {105}
}

@article{Caviston2011,
  abstract = {Huntingtin (Htt) is a membrane-associated scaffolding protein that interacts with microtubule motors as well as actin-associated adaptor molecules. We examined a role for Htt in the dynein-mediated intracellular trafficking of endosomes and lysosomes. In HeLa cells depleted of either Htt or dynein, early, recycling, and late endosomes (LE)/lysosomes all become dispersed. Despite altered organelle localization, kinetic assays indicate only minor defects in intracellular trafficking. Expression of full-length Htt is required to restore organelle localization in Htt-depleted cells, supporting a role for Htt as a scaffold that promotes functional interactions along its length. In dynein-depleted cells, LE/lysosomes accumulate in tight patches near the cortex, apparently enmeshed by cortactin-positive actin filaments; Latrunculin B-treatment disperses these patches. Peripheral LE/lysosomes in dynein-depleted cells no longer colocalize with microtubules. Htt may be required for this off-loading, as the loss of microtubule association is not seen in Htt-depleted cells or in cells depleted of both dynein and Htt. Inhibition of kinesin-1 relocalizes peripheral LE/lysosomes induced by Htt depletion but not by dynein depletion, consistent with their detachment from microtubules upon dynein knockdown. Together, these data support a model of Htt as a facilitator of dynein-mediated trafficking that may regulate the cytoskeletal association of dynamic organelles.},
  author = {Caviston, Juliane P and Zajac, Allison L and Tokito, Mariko and Holzbaur, Erika L F},
  date = {2011-02-15},
  doi = {10/bhxdc9},
  eprint = {21169558},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Caviston et al. - 2011 - Huntingtin coordinates the dynein-mediated dynamic.pdf},
  issn = {1939-4586},
  journaltitle = {Molecular Biology of the Cell},
  keywords = {Actins,Actins: metabolism,Cell Line; Tumor,Cytoskeleton,Cytoskeleton: metabolism,Dyneins,Dyneins: genetics,Dyneins: metabolism,Endosomes,Endosomes: metabolism,Gene Knockdown Techniques,Gene Knockdown Techniques: methods,HeLa Cells,Humans,Lysosome-Associated Membrane Glycoproteins,Lysosome-Associated Membrane Glycoproteins: metabolism,Lysosomes,Lysosomes: metabolism,Microtubule-Associated Proteins,Microtubule-Associated Proteins: metabolism,Microtubule-Associated Proteins: physiology,Microtubules,Microtubules: metabolism,Microtubules: physiology,Molecular Motor Proteins,Molecular Motor Proteins: genetics,Molecular Motor Proteins: metabolism,Nerve Tissue Proteins,Nerve Tissue Proteins: genetics,Nerve Tissue Proteins: metabolism,Nuclear Proteins,Nuclear Proteins: genetics,Nuclear Proteins: metabolism,Organelles,Organelles: metabolism,Polymerization,Protein Transport,Protein Transport: physiology,RNA Interference},
  number = {4},
  pages = {478-492},
  title = {Huntingtin Coordinates the Dynein-Mediated Dynamic Positioning of Endosomes and Lysosomes},
  volume = {22}
}

@article{Chabbert2015,
  abstract = {\textcopyright{} 2015 The Authors. Published under the terms of the CC BY 4.0 license. We present a modified approach of chromatin immuno-precipitation followed by sequencing (ChIP-Seq), which relies on the direct ligation of molecular barcodes to chromatin fragments, thereby permitting experimental scale-up. With Bar-ChIP now enabling the concurrent profiling of multiple DNA-protein interactions, we report the simultaneous generation of 90 ChIP-Seq datasets without any robotic instrumentation. We demonstrate that application of Bar-ChIP to a panel of Saccharomyces cerevisiae chromatin-associated mutants provides a rapid and accurate genome-wide overview of their chromatin status. Additionally, we validate the utility of this technology to derive novel biological insights by identifying a role for the Rpd3S complex in maintaining H3K14 hypo-acetylation in gene bodies. We also report an association between the presence of intragenic H3K4 tri-methylation and the emergence of cryptic transcription in a Set2 mutant. Finally, we uncover a crosstalk between H3K14 acetylation and H3K4 methylation in this mutant. These results show that Bar-ChIP enables biological discovery through rapid chromatin profiling at single-nucleosome resolution for various conditions and protein modifications at once. Synopsis A new approach provides a rapid and accurate genome-wide overview of the chromatin status of multiple yeast chromatin-associated mutants at once. The simultaneous profiling of epigenetic marks in the mutants is achieved by multiplex immuno-precipitation of barcoded chromatin samples. Bar-ChIP is based on the immuno-precipitation of barcoded chromatin and permits sample multiplexing, thereby increasing the throughput of ChIP-Seq experiments. Application of the method to yeast chromatin-associated mutants enabled the concurrent generation of 90 ChIP-Seq datasets without the need for robotic instrumentation. 
The rapid chromatin profiling of the mutants at single-nucleosome resolution uncovered an association between intragenic H3K4 tri-methylation and cryptic transcription in set2{$\Delta$}. A new approach provides a rapid and accurate genome-wide overview of the chromatin status of multiple yeast chromatin-associated mutants at once. The simultaneous profiling of epigenetic marks in the mutants is achieved by multiplex immuno-precipitation of barcoded chromatin samples.},
  author = {Chabbert, Christophe D and Adjalley, Sophie H and Klaus, Bernd and Fritsch, Emilie S and Gupta, Ishaan and Pelechano, Vicent and Steinmetz, Lars M},
  date = {2015},
  doi = {10/f2zhr9},
  eprint = {25583149},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Chabbert et al. - 2015 - A high‐throughput C h IP ‐ S eq for large‐scale ch.pdf},
  issn = {1744-4292},
  journaltitle = {Molecular Systems Biology},
  keywords = {chip-seq,chromatin,high-throughput,histone,histone marks},
  number = {1},
  pages = {777},
  title = {A High-Throughput {{ChIP-Seq}} for Large-Scale Chromatin Studies},
  volume = {11}
}

@article{Chabbert2016,
  abstract = {The genome-wide study of epigenetic states requires the integrative analysis of histone modification ChIP-seq data. Here, we introduce an easy-to-use analytic framework to compare profiles of enrichment in histone modifications around classes of genomic elements, e.g. transcription start sites (TSS). Our framework is available via the user-friendly R/Bioconductor package DChIPRep. DChIPRep uses biological replicate information as well as chromatin Input data to allow for a rigorous assessment of differential enrichment. DChIPRep is available for download through the Bioconductor project at http://bioconductor.org/packages/DChIPRep.},
  author = {Chabbert, Christophe D. and Steinmetz, Lars M. and Klaus, Bernd},
  date = {2016-04-26},
  doi = {10/ggcxj4},
  eprint = {27168989},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Chabbert et al. - 2016 - DChIPRep, an RBioconductor package for differenti.pdf},
  issn = {2167-8359},
  journaltitle = {PeerJ},
  keywords = {Bioinformatics,ChiP-seq,Chromatin,Computational biology,Differential enrichment,Genomics,Histone-modifications,Statistics},
  pages = {e1981},
  title = {{{DChIPRep}}, an {{R}}/{{Bioconductor}} Package for Differential Enrichment Analysis in Chromatin Studies},
  volume = {4}
}

@collection{chambersStatisticalModels1992,
  title     = {Statistical {{Models}} in {{S}}},
  editor    = {Chambers, John M. and Hastie, Trevor J.},
  edition   = {1},
  publisher = {{Routledge}},
  date      = {1992},
  isbn      = {978-0-203-73853-5},
  doi       = {10/gf5g89},
  url       = {https://www.taylorfrancis.com/books/e/9780203738535},
  langid    = {english},
  keywords  = {S models statistical statistics},
  ids       = {chambers:1992}
}

@article{Champagne2014,
  abstract = {Land cover and land use classifications from remote sensing are increasingly becoming institutionalized framework data sets for monitoring environmental change. As such, the need for robust statements of classification accuracy is critical. This paper describes a method to estimate confidence in classification model accuracy using a bootstrap approach. Using this method, it was found that classification accuracy and confidence, while closely related, can be used in complementary ways to provide additional information on map accuracy and define groups of classes and to inform the future reference sampling strategies. Overall classification accuracy increases with an increase in the number of fields surveyed, where the width of classification confidence bounds decreases. Individual class accuracies and confidence were non-linearly related to the number of fields surveyed. Results indicate that some classes can be estimated accurately and confidently with fewer numbers of samples, whereas others require larger reference data sets to achieve satisfactory results. This approach is an improvement over other approaches for estimating class accuracy and confidence as it uses repetitive sampling to produce a more realistic estimate of the range in classification accuracy and confidence that can be obtained with different reference data inputs. \textcopyright{} 2014 Published by Elsevier B.V.},
  author = {Champagne, Catherine and McNairn, Heather and Daneshfar, Bahram and Shang, Jiali},
  date = {2014},
  doi = {10/f5v533},
  file = {/Users/ryan/Documents/Zotero Library/Champagne et al. - 2014 - A bootstrap method for assessing classification ac.pdf},
  issn = {1569-8432},
  journaltitle = {International Journal of Applied Earth Observation and Geoinformation},
  number = {1},
  pages = {44-52},
  title = {A Bootstrap Method for Assessing Classification Accuracy and Confidence for Agricultural Land Use Mapping in {{Canada}}},
  volume = {29}
}

@article{Chang2008,
  abstract = {BACKGROUND: Alternative RNA splicing greatly increases proteome diversity and thereby contribute to species- or tissue-specific functions. The possibility to study alternative splicing (AS) events on a genomic scale using splicing-sensitive microarrays, including the Affymetrix GeneChip Exon 1.0 ST microarray (exon array), has appeared very recently. However, the application of this new technology is hindered by the lack of free and user-friendly software devoted to these novel platforms.

RESULTS: In this study we present a Java-based freeware, easyExon http://microarray.ym.edu.tw/easyexon, to process, filtrate and visualize exon array data with an analysis pipeline. This tool implements the most commonly used probeset summarization methods as well as AS-orientated filtration algorithms, e.g. MIDAS and PAC, for the detection of alternative splicing events. We include a biological filtration function according to GO terms, and provide a module to visualize and interpret the selected exons and transcripts. Furthermore, easyExon can integrate with other related programs, such as Integrate Genome Browser (IGB) and Affymetrix Power Tools (APT), to make the whole analysis more comprehensive. We applied easyExon on a public accessible colon cancer dataset as an example to illustrate the analysis pipeline of this tool.

CONCLUSION: EasyExon can efficiently process and analyze the Affymetrix exon array data. The simplicity, flexibility and brevity of easyExon make it a valuable tool for AS event identification in genomic research.},
  author = {Chang, Ting-Yu and Li, Yin-Yi and Jen, Chih-Hung and Yang, Tsun-Po and Lin, Chi-Hung and Hsu, Ming-Ta and Wang, Hsei-Wei},
  date = {2008-01},
  doi = {10/fqf9jm},
  eprint = {18851762},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Chang et al. - 2008 - easyExon--a Java-based GUI tool for processing and.pdf},
  issn = {1471-2105},
  journaltitle = {BMC Bioinformatics},
  keywords = {Alternative Splicing,Alternative Splicing: genetics,Animals,Exons,Gene Expression Profiling,Gene Expression Profiling: methods,Humans,Information Storage and Retrieval,Information Storage and Retrieval: methods,Mice,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods,Rats,User-Computer Interface},
  pages = {432},
  title = {{{easyExon}}--a {{Java}}-Based {{GUI}} Tool for Processing and Visualization of {{Affymetrix}} Exon Array Data},
  volume = {9}
}

@article{Chen2007,
  abstract = {Orthology detection is critically important for accurate functional annotation, and has been widely used to facilitate studies on comparative and evolutionary genomics. Although various methods are now available, there has been no comprehensive analysis of performance, due to the lack of a genomic-scale 'gold standard' orthology dataset. Even in the absence of such datasets, the comparison of results from alternative methodologies contains useful information, as agreement enhances confidence and disagreement indicates possible errors. Latent Class Analysis (LCA) is a statistical technique that can exploit this information to reasonably infer sensitivities and specificities, and is applied here to evaluate the performance of various orthology detection methods on a eukaryotic dataset. Overall, we observe a trade-off between sensitivity and specificity in orthology detection, with BLAST-based methods characterized by high sensitivity, and tree-based methods by high specificity. Two algorithms exhibit the best overall balance, with both sensitivity and specificity{$>$}80\%: INPARANOID identifies orthologs across two species while OrthoMCL clusters orthologs from multiple species. Among methods that permit clustering of ortholog groups spanning multiple genomes, the (automated) OrthoMCL algorithm exhibits better within-group consistency with respect to protein function and domain architecture than the (manually curated) KOG database, and the homolog clustering algorithm TribeMCL as well. By way of using LCA, we are also able to comprehensively assess similarities and statistical dependence between various strategies, and evaluate the effects of parameter settings on performance. In summary, we present a comprehensive evaluation of orthology detection on a divergent set of eukaryotic genomes, thus providing insights and guides for method selection, tuning and development for different applications. 
Many biological questions have been addressed by multiple tests yielding binary (yes/no) outcomes but no clear definition of truth, making LCA an attractive approach for computational biology.},
  author = {Chen, Feng and Mackey, Aaron J and Vermunt, Jeroen K and Roos, David S},
  date = {2007-01},
  doi = {10/bbkkn3},
  eprint = {17440619},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Chen et al. - 2007 - Assessing performance of orthology detection strat.pdf},
  issn = {1932-6203},
  journaltitle = {PLoS ONE},
  keywords = {Algorithms,Eukaryotic Cells,Genome},
  number = {4},
  pages = {e383},
  title = {Assessing Performance of Orthology Detection Strategies Applied to Eukaryotic Genomes},
  volume = {2}
}

@article{Chen2007a,
  abstract = {Background: Cowpea [Vigna unguiculata (L.) Walp.] is one of the most important food and forage legumes in the semi-arid tropics because of its ability to tolerate drought and grow on poor soils. It is cultivated mostly by poor farmers in developing countries, with 80\% of production taking place in the dry savannah of tropical West and Central Africa. Cowpea is largely an underexploited crop with relatively little genomic information available for use in applied plant breeding. The goal of the Cowpea Genomics Initiative (CGI), funded by the Kirkhouse Trust, a UK-based charitable organization, is to leverage modern molecular genetic tools for gene discovery and cowpea improvement. One aspect of the initiative is the sequencing of the gene-rich region of the cowpea genome (termed the genespace) recovered using methylation filtration technology and providing annotation and analysis of the sequence data. Description: CGKB, Cowpea Genespace/Genomics Knowledge Base, is an annotation knowledge base developed under the CGI. The database is based on information derived from 298,848 cowpea genespace sequences (GSS) isolated by methylation filtering of genomic DNA. The CGKB consists of three knowledge bases: GSS annotation and comparative genomics knowledge base, GSS enzyme and metabolic pathway knowledge base, and GSS simple sequence repeats (SSRs) knowledge base for molecular marker discovery. A homology-based approach was applied for annotations of the GSS, mainly using BLASTX against four public FASTA formatted protein databases (NCBI GenBank Proteins, UniProtKB-Swiss-Prot, UniprotKB-PIR (Protein Information Resource), and UniProtKB-TrEMBL). Comparative genome analysis was done by BLASTX searches of the cowpea GSS against four plant proteomes from Arabidopsis thaliana, Oryza sativa, Medicago truncatula, and Populus trichocarpa. 
The possible exons and introns on each cowpea GSS were predicted using the HMM-based Genscan gene predication program and the potential domains on annotated GSS were analyzed using the HMMER package against the Pfam database. The annotated GSS were also assigned with Gene Ontology annotation terms and integrated with 228 curated plant metabolic pathways from the Arabidopsis Information Resource (TAIR) knowledge base. The UniProtKB-Swiss-Prot ENZYME database was used to assign putative enzymatic function to each GSS. Each GSS was also analyzed with the Tandem Repeat Finder (TRF) program in order to identify potential SSRs for molecular marker discovery. The raw sequence data, processed annotation, and SSR results were stored in relational tables designed in key-value pair fashion using a PostgreSQL relational database management system. The biological knowledge derived from the sequence data and processed results are represented as views or materialized views in the relational database management system. All materialized views are indexed for quick data access and retrieval. Data processing and analysis pipelines were implemented using the Perl programming language. The web interface was implemented in JavaScript and Perl CGI running on an Apache web server. The CPU intensive data processing and analysis pipelines were run on a computer cluster of more than 30 dual-processor Apple XServes. A job management system called Vela was created as a robust way to submit large numbers of jobs to the Portable Batch System (PBS). Conclusion: CGKB is an integrated and annotated resource for cowpea GSS with features of homology-based and HMM-based annotations, enzyme and pathway annotations, GO term annotation, toolkits, and a large number of other facilities to perform complex queries. The cowpea GSS, chloroplast sequences, mitochondrial sequences, retroelements, and SSR sequences are available as FASTA formatted files and downloadable at CGKB. 
\textcopyright{} 2007 Chen et al; licensee BioMed Central Ltd.},
  author = {Chen, Xianfeng and Laudeman, Thomas W. and Rushton, Paul J. and Spraggins, Thomas A. and Timko, Michael P.},
  date = {2007},
  doi = {10/b8nt58},
  file = {/Users/ryan/Documents/Zotero Library/Chen et al. - 2007 - CGKB An annotation knowledge base for cowpea (Vig.pdf},
  issn = {1471-2105},
  journaltitle = {BMC Bioinformatics},
  number = {1},
  pages = {129},
  title = {{{CGKB}}: {{An}} Annotation Knowledge Base for Cowpea ({{Vigna}} {{unguiculata}} {{L}}.) Methylation Filtered Genomic Genespace Sequences},
  volume = {8}
}

@article{Chen2012,
  abstract = {Personalized medicine is expected to benefit from combining genomic information with regular monitoring of physiological states by multiple high-throughput methods. Here, we present an integrative personal omics profile (iPOP), an analysis that combines genomic, transcriptomic, proteomic, metabolomic, and autoantibody profiles from a single individual over a 14 month period. Our iPOP analysis revealed various medical risks, including type 2 diabetes. It also uncovered extensive, dynamic changes in diverse molecular components and biological pathways across healthy and diseased conditions. Extremely high-coverage genomic and transcriptomic data, which provide the basis of our iPOP, revealed extensive heteroallelic changes during healthy and diseased states and an unexpected RNA editing mechanism. This study demonstrates that longitudinal iPOP can be used to interpret healthy and diseased states by connecting genomic information with additional dynamic omics activity. \textcopyright{} 2012 Elsevier Inc.},
  author = {Chen, Rui and Mias, George I. and Li-Pook-Than, Jennifer and Jiang, Lihua and Lam, Hugo Y. K. and Chen, Rong and Miriami, Elana and Karczewski, Konrad J. and Hariharan, Manoj and Dewey, Frederick E. and Cheng, Yong and Clark, Michael J. and Im, Hogune and Habegger, Lukas and Balasubramanian, Suganthi and O'Huallachain, Maeve and Dudley, Joel T. and Hillenmeyer, Sara and Haraksingh, Rajini and Sharon, Donald and Euskirchen, Ghia and Lacroute, Phil and Bettinger, Keith and Boyle, Alan P. and Kasowski, Maya and Grubert, Fabian and Seki, Scott and Garcia, Marco and Whirl-Carrillo, Michelle and Gallardo, Mercedes and Blasco, Maria A. and Greenberg, Peter L. and Snyder, Phyllis and Klein, Teri E. and Altman, Russ B. and Butte, Atul J. and Ashley, Euan A. and Gerstein, Mark and Nadeau, Kari C. and Tang, Hua and Snyder, Michael},
  date = {2012-03},
  doi = {10/gcpgfm},
  file = {/Users/ryan/Documents/Zotero Library/Chen et al. - 2012 - Personal omics profiling reveals dynamic molecular.pdf},
  issn = {1097-4172},
  journaltitle = {Cell},
  number = {6},
  pages = {1293-1307},
  title = {Personal Omics Profiling Reveals Dynamic Molecular and Medical Phenotypes},
  volume = {148}
}

@article{Chen2013,
  author       = {Chen, Jing and Hu, Zhen and Phatak, Mukta and Reichard, John and Freudenberg, Johannes M. and Sivaganesan, Siva and Medvedovic, Mario},
  editor       = {Morris, Quaid},
  title        = {Genome-{{Wide Signatures}} of {{Transcription Factor Activity}}: {{Connecting Transcription Factors}}, {{Disease}}, and {{Small Molecules}}},
  journaltitle = {PLoS Computational Biology},
  date         = {2013-09-05},
  volume       = {9},
  number       = {9},
  pages        = {e1003198},
  issn         = {1553-7358},
  doi          = {10/ggcxj6},
  file         = {/Users/ryan/Documents/Zotero Library/Chen et al. - 2013 - Genome-Wide Signatures of Transcription Factor Act.pdf},
  abstract     = {Identifying transcription factors (TF) involved in producing a genome-wide transcriptional profile is an essential step in building mechanistic model that can explain observed gene expression data. We developed a statistical framework for constructing genome-wide signatures of TF activity, and for using such signatures in the analysis of gene expression data produced by complex transcriptional regulatory programs. Our framework integrates ChIP-seq data and appropriately matched gene expression profiles to identify True REGulatory (TREG) TF-gene interactions. It provides genome-wide quantification of the likelihood of regulatory TF-gene interaction that can be used to either identify regulated genes, or as genome-wide signature of TF activity. To effectively use ChIP-seq data, we introduce a novel statistical model that integrates information from all binding ``peaks'' within 2 Mb window around a gene's transcription start site (TSS), and provides gene-level binding scores and probabilities of regulatory interaction. In the second step we integrate these binding scores and regulatory probabilities with gene expression data to assess the likelihood of True REGulatory (TREG) TF-gene interactions. We demonstrate the advantages of TREG framework in identifying genes regulated by two TFs with widely different distribution of functional binding events (ER{$\alpha$} and E2f1). We also show that TREG signatures of TF activity vastly improve our ability to detect involvement of ER{$\alpha$} in producing complex diseases-related transcriptional profiles. Through a large study of disease-related transcriptional signatures and transcriptional signatures of drug activity, we demonstrate that increase in statistical power associated with the use of TREG signatures makes the crucial difference in identifying key targets for treatment, and drugs to use for treatment. All methods are implemented in an open-source R package treg. 
The package also contains all data used in the analysis including 494 TREG binding profiles based on ENCODE ChIP-seq data. The treg package can be downloaded at http://GenomicsPortals.org.}
}

@article{Chen2016,
  abstract = {Recent advances in sequencing technology have opened a new era in RNA studies. Novel types of RNAs such as long non-coding RNAs (lncRNAs) have been discovered by transcriptomic sequencing and some lncRNAs have been found to play essential roles in biological processes. However, only limited information is available for lncRNAs in Drosophila melanogaster, an important model organism. Therefore, the characterization of lncRNAs and identification of new lncRNAs in D. melanogaster is an important area of research. Moreover, there is an increasing interest in the use of ChIP-seq data (H3K4me3, H3K36me3 and Pol II) to detect signatures of active transcription for reported lncRNAs. We have developed a computational approach to identify new lncRNAs from two tissue-specific RNA-seq datasets using the poly(A)-enriched and the ribo-zero method, respectively. In our results, we identified 462 novel lncRNA transcripts, which we combined with 4137 previously published lncRNA transcripts into a curated dataset. We then utilized 61 RNA-seq and 32 ChIP-seq datasets to improve the annotation of the curated lncRNAs with regards to transcriptional direction, exon regions, classification, expression in the brain, possession of a poly(A) tail, and presence of conventional chromatin signatures. Furthermore, we used 30 time-course RNA-seq datasets and 32 ChIP-seq datasets to investigate whether the lncRNAs reported by RNA-seq have active transcription signatures. The results showed that more than half of the reported lncRNAs did not have chromatin signatures related to active transcription. To clarify this issue, we conducted RT-qPCR experiments and found that \textasciitilde{}95.24~\% of the selected lncRNAs were truly transcribed, regardless of whether they were associated with active chromatin signatures or not. In this study, we discovered a large number of novel lncRNAs, which suggests that many remain to be identified in D. melanogaster. 
For the lncRNAs that are known, we improved their characterization by integrating a large number of sequencing datasets (93 sets in total) from multiple sources (lncRNAs, RNA-seq and ChIP-seq). The RT-qPCR experiments demonstrated that RNA-seq is a reliable platform to discover lncRNAs. This set of curated lncRNAs with improved annotations can serve as an important resource for investigating the function of lncRNAs in D. melanogaster.},
  author = {Chen, Mei Ju May and Chen, Li Kai and Lai, Yu Shing and Lin, Yu Yu and Wu, Dung Chi and Tung, Yi An and Liu, Kwei Yan and Shih, Hsueh Tzu and Chen, Yi Jyun and Lin, Yan Liang and Ma, Li Ting and Huang, Jian Long and Wu, Po Chun and Hong, Ming Yi and Chu, Fang Hua and Wu, June Tai and Li, Wen Hsiung and Chen, Chien Yu},
  date = {2016},
  doi = {10/f8vn3n},
  eprint = {26969372},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Chen et al. - 2016 - Integrating RNA-seq and ChIP-seq data to character.pdf},
  issn = {1471-2164},
  journaltitle = {BMC Genomics},
  keywords = {Active transcription,ChIP-seq,Drosophila melanogaster,Long non-coding RNA,RNA-seq},
  number = {1},
  pages = {1-14},
  title = {Integrating {{RNA}}-Seq and {{ChIP}}-Seq Data to Characterize Long Non-Coding {{RNAs}} in {{Drosophila}} {{melanogaster}}},
  volume = {17}
}

@incollection{chenDifferentialExpressionAnalysis2014,
  abstract = {This article reviews the statistical theory underlying the edgeR software package for differential expression of RNA-seq data. Negative binomial models are used to capture the quadratic mean-variance relationship that can be observed in RNA-seq data. Conditional likelihood methods are used to avoid bias when estimating the level of variation. Empirical Bayes methods are used to allow gene-specific variation estimates even when the number of replicate samples is very small. Generalized linear models are used to accommodate arbitrarily complex designs. A key feature of the edgeR package is the use of weighted likelihood methods to implement a flexible empirical Bayes approach in the absence of easily tractable sampling distributions. The methodology is implemented in flexible software that is easy to use even for users who are not professional statisticians or bioinformaticians. The software is part of the Bioconductor project. This article describes some recently implemented features. Loess-style weighting is used to improve the weighted likelihood approach, and an analogy with quasi-likelihood is used to estimate the optimal weight to be given to the empirical Bayes prior. The article includes a fully worked case study with complete code.},
  author = {Chen, Yunshun and Lun, Aaron T. L. and Smyth, Gordon K.},
  booktitle = {Statistical {{Analysis}} of {{Next Generation Sequencing Data}}},
  date = {2014},
  doi = {10/dd8n},
  editor = {Datta, Somnath and Nettleton, Dan},
  file = {/Users/ryan/Documents/Zotero Library/Chen et al. - 2014 - Differential Expression Analysis of Complex RNA-se.pdf},
  ids = {Chen2014},
  isbn = {978-3-319-07212-8},
  location = {{Cham}},
  pages = {51-74},
  publisher = {{Springer International Publishing}},
  title = {Differential {{Expression Analysis}} of {{Complex RNA}}-Seq {{Experiments Using edgeR}}},
  url = {http://link.springer.com/10.1007/978-3-319-07212-8_3}
}

@article{chenGeneOntologyBased2014,
  abstract = {RNA-seq analysis provides a powerful tool for revealing relationships between gene expression level and biological function of proteins. In order to identify differentially expressed genes among various RNA-seq datasets obtained from different experimental designs, an appropriate normalization method for calibrating multiple experimental datasets is the first challenging problem. We propose a novel method to facilitate biologists in selecting a set of suitable housekeeping genes for inter-sample normalization. The approach is achieved by adopting user defined experimentally related keywords, GO annotations, GO term distance matrices, orthologous housekeeping gene candidates, and stability ranking of housekeeping genes. By identifying the most distanced GO terms from query keywords and selecting housekeeping gene candidates with low coefficients of variation among different spatio-temporal datasets, the proposed method can automatically enumerate a set of functionally irrelevant housekeeping genes for practical normalization. Novel and benchmark testing RNA-seq datasets were applied to demonstrate that different selections of housekeeping gene lead to strong impact on differential gene expression analysis, and compared results have shown that our proposed method outperformed other traditional approaches in terms of both sensitivity and specificity. The proposed mechanism of selecting appropriate housekeeping genes for inter-dataset normalization is robust and accurate for differential expression analyses.},
  author = {Chen, Chien-Ming and Lu, Yu-Lun and Sio, Chi-Pong and Wu, Guan-Chung and Tzou, Wen-Shyong and Pai, Tun-Wen},
  date = {2014-02-18},
  doi = {10/ggcxj5},
  eprint = {24561167},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Chen et al. - 2014 - Gene Ontology Based Housekeeping Gene Selection fo.pdf},
  issn = {1095-9130},
  journaltitle = {Methods (San Diego, Calif.)},
  title = {Gene {{Ontology Based Housekeeping Gene Selection}} for {{RNA}}-Seq {{Normalization}}}
}

@article{Chevreux2004,
  abstract = {We present an EST sequence assembler that specializes in reconstruction of pristine mRNA transcripts, while at the same time detecting and classifying single nucleotide polymorphisms (SNPs) occurring in different variations thereof. The assembler uses iterative multipass strategies centered on high-confidence regions within sequences and has a fallback strategy for using low-confidence regions when needed. It features special functions to assemble high numbers of highly similar sequences without prior masking, an automatic editor that edits and analyzes alignments by inspecting the underlying traces, and detection and classification of sequence properties like SNPs with a high specificity and a sensitivity down to one mutation per sequence. In addition, it includes possibilities to use incorrectly preprocessed sequences, routines to make use of additional sequencing information such as base-error probabilities, template insert sizes, strain information, etc., and functions to detect and resolve possible misassemblies. The assembler is routinely used for such various tasks as mutation detection in different cell types, similarity analysis of transcripts between organisms, and pristine assembly of sequences from various sources for oligo design in clinical microarray experiments. \textcopyright{}2004 by Cold Spring Harbor Laboratory Press.},
  author = {Chevreux, Bastien and Pfisterer, Thomas and Drescher, Bernd and Driesel, Albert J. and M{\"u}ller, Werner E. G. and Wetter, Thomas and Suhai, S{\'a}ndor},
  date = {2004},
  doi = {10/bz255c},
  file = {/Users/ryan/Documents/Zotero Library/Chevreux et al. - 2004 - Using the miraEST assembler for reliable and autom.pdf},
  issn = {1088-9051},
  journaltitle = {Genome Research},
  number = {6},
  pages = {1147-1159},
  title = {Using the {{miraEST}} Assembler for Reliable and Automated {{mRNA}} Transcript Assembly and {{SNP}} Detection in Sequenced {{ESTs}}},
  volume = {14}
}

@thesis{chevreux2005mira,
  author = {Chevreux, Bastien},
  date = {2005},
  file = {/Users/ryan/Zotero/storage/9WQRY4EN/MIRA thesis.warc},
  institution = {{German Cancer Research Center Heidelberg}},
  keywords = {\#nosource},
  pagetotal = {322},
  title = {{{MIRA}}: An {{Automated Genome}} and {{EST Assembler}}},
  type = {phdthesis},
  url = {http://chevreux.org/thesis/}
}

@article{Chicco2017,
  author = {Chicco, Davide and Grassi, Elena and Gonnella, Giorgio and Giacomoni, Franck and Clarke, Erik and Blankenberg, Daniel and Tran, Camy and Laurent, Sacha and Gopez, Matthew and Sennblad, Bengt and Baaijens, Jasmijn A and Ewels, Philip and Wright, Patrick R and Enache, Oana M and Roger, Pierrick and Dampier, Will and Koppstein, David and Devisetty, Upendra Kumar and Rausch, Tobias and Salatino, Adrian Emanuel and Seiler, Julien and Jung, Matthieu and Cumbo, Fabio and Moskalenko, Oleksandr and Bogema, Daniel R and Workentine, Matthew L and Newhouse, Stephen J and Leprevost, Veiga and Arvai, Kevin and others},
  date = {2017},
  doi = {10/gcjkpk},
  file = {/Users/ryan/Documents/Zotero Library/Chicco et al. - 2017 - Bioconda A sustainable and comprehensive software.pdf},
  pages = {1-12},
  title = {Bioconda: {{A}} Sustainable and Comprehensive Software Distribution for the Life Sciences}
}

@article{Choi2014,
  abstract = {Background: Transcriptome analysis of porcine whole blood has several applications, which include deciphering genetic mechanisms for host responses to viral infection and vaccination. The abundance of alpha- and beta-globin transcripts in blood, however, impedes the ability to cost-effectively detect transcripts of low abundance. Although protocols exist for reduction of globin transcripts from human and mouse/rat blood, preliminary work demonstrated these are not useful for porcine blood Globin Reduction (GR). Our objectives were to develop a porcine specific GR protocol and to evaluate the GR effects on gene discovery and sequence read coverage in RNA-sequencing (RNA-seq) experiments. Results: A GR protocol for porcine blood samples was developed using RNase H with antisense oligonucleotides specifically targeting porcine hemoglobin alpha (HBA) and beta (HBB) mRNAs. Whole blood samples (n = 12) collected in Tempus tubes were used for evaluating the efficacy and effects of GR on RNA-seq. The HBA and HBB mRNA transcripts comprised an average of 46.1\% of the mapped reads in pre-GR samples, but those reads reduced to an average of 8.9\% in post-GR samples. Differential gene expression analysis showed that the expression level of 11,046 genes were increased, whereas 34 genes, excluding HBA and HBB, showed decreased expression after GR (FDR {$<$} 0.05). An additional 815 genes were detected only in post-GR samples. Conclusions: Our porcine specific GR primers and protocol minimize the number of reads of globin transcripts in whole blood samples and provides increased coverage as well as accuracy and reproducibility of transcriptome analysis. Increased detection of low abundance mRNAs will ensure that studies relying on transcriptome analyses do not miss information that may be vital to the success of the study.},
  author = {Choi, Igseo and Bao, Hua and Kommadath, Arun and Hosseini, Afshin and Sun, Xu and Meng, Yan and Stothard, Paul and Plastow, Graham S. and Tuggle, Christopher K. and Reecy, James M. and Fritz-Waters, Eric and Abrams, Samuel M. and Lunney, Joan K. and Guan, Le Luo},
  date = {2014},
  doi = {10/gb3g9j},
  file = {/Users/ryan/Documents/Zotero Library/Choi et al. - 2014 - Increasing gene discovery and coverage using RNA-s.pdf},
  issn = {1471-2164},
  journaltitle = {BMC Genomics},
  keywords = {Blood,Globin reduction,Pig,RNA-seq,Transcriptome},
  number = {1},
  pages = {1-10},
  title = {Increasing Gene Discovery and Coverage Using {{RNA}}-Seq of Globin {{RNA}} Reduced Porcine Blood Samples},
  volume = {15}
}

@article{Chuen,
  author = {Tan, Chuen Seng},
  file = {/Users/ryan/Documents/Zotero Library/Chuen and Tan - Additional file 1  Correlating gene and protein e.pdf},
  pages = {1-16},
  title = {Additional File 1: {{Correlating}} Gene and Protein Expression Data Using {{Correlated Factor Analysis}}}
}

@article{Chung2014,
  abstract = {The Aspergillus fumigatus sterol regulatory element binding protein (SREBP) SrbA belongs to the basic Helix-Loop-Helix (bHLH) family of transcription factors and is crucial for antifungal drug resistance and virulence. The latter phenotype is especially striking, as loss of SrbA results in complete loss of virulence in murine models of invasive pulmonary aspergillosis (IPA). How fungal SREBPs mediate fungal virulence is unknown, though it has been suggested that lack of growth in hypoxic conditions accounts for the attenuated virulence. To further understand the role of SrbA in fungal infection site pathobiology, chromatin immunoprecipitation followed by massively parallel DNA sequencing (ChIP-seq) was used to identify genes under direct SrbA transcriptional regulation in hypoxia. These results confirmed the direct regulation of ergosterol biosynthesis and iron uptake by SrbA in hypoxia and revealed new roles for SrbA in nitrate assimilation and heme biosynthesis. Moreover, functional characterization of an SrbA target gene with sequence similarity to SrbA identified a new transcriptional regulator of the fungal hypoxia response and virulence, SrbB. SrbB co-regulates genes involved in heme biosynthesis and demethylation of C4-sterols with SrbA in hypoxic conditions. However, SrbB also has regulatory functions independent of SrbA including regulation of carbohydrate metabolism. Loss of SrbB markedly attenuates A. fumigatus virulence, and loss of both SREBPs further reduces in vivo fungal growth. These data suggest that both A. fumigatus SREBPs are critical for hypoxia adaptation and virulence and reveal new insights into SREBPs' complex role in infection site adaptation and fungal virulence.},
  author = {Chung, Dawoon and Barker, Bridget M. and Carey, Charles C. and Merriman, Brittney and Werner, Ernst R. and Lechner, Beatrix E. and Dhingra, Sourabh and Cheng, Chao and Xu, Wenjie and Blosser, Sara J. and Morohashi, Kengo and Mazurie, Aur{\'e}lien and Mitchell, Thomas K. and Haas, Hubertus and Mitchell, Aaron P. and Cramer, Robert A.},
  date = {2014},
  doi = {10/f6rh5r},
  eprint = {25375670},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Chung et al. - 2014 - ChIP-seq and In Vivo Transcriptome Analyses of the.pdf},
  issn = {1553-7366, 1553-7374},
  journaltitle = {PLoS Pathogens},
  number = {11},
  title = {{{ChIP}}-Seq and {{In Vivo Transcriptome Analyses}} of the {{Aspergillus}} Fumigatus {{SREBP SrbA Reveals}} a {{New Regulator}} of the {{Fungal Hypoxia Response}} and {{Virulence}}},
  volume = {10}
}

@article{Chung2015,
  author = {Chung, Neo Christopher and Storey, John D.},
  date = {2015},
  doi = {10/f63qzh},
  file = {/Users/ryan/Documents/Zotero Library/Chung and Storey - 2015 - Statistical significance of variables driving syst.pdf},
  issn = {1367-4803},
  journaltitle = {Bioinformatics},
  number = {4},
  pages = {545-554},
  title = {Statistical Significance of Variables Driving Systematic Variation in High-Dimensional Data},
  volume = {31}
}

@article{Churchill2002,
  abstract = {Microarray technology is now widely available and is being applied to address increasingly complex scientific questions. Consequently, there is a greater demand for statistical assessment of the conclusions drawn from microarray experiments. This review discusses fundamental issues of how to design an experiment to ensure that the resulting data are amenable to statistical analysis. The discussion focuses on two-color spotted cDNA microarrays, but many of the same issues apply to single-color gene-expression assays as well.},
  author = {Churchill, Gary A.},
  date = {2002-12},
  doi = {10/b7sg6b},
  eprint = {12454643},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Churchill - 2002 - Fundamentals of experimental design for cDNA micro.pdf},
  issn = {1061-4036},
  journaltitle = {Nature genetics},
  keywords = {Animals,DNA; Complementary,DNA; Complementary: analysis,Gene Expression,Gene Expression Profiling,Gene Expression Profiling: methods,Mice,Models; Biological,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods,Reference Standards,Reproducibility of Results,Research Design,Statistics as Topic},
  pages = {490-5},
  title = {Fundamentals of Experimental Design for {{cDNA}} Microarrays},
  volume = {32 Suppl}
}

@article{Clark2013,
  abstract = {Empirical analyses in social science frequently confront quantitative data that are clustered or grouped. To account for group-level variation and improve model fit, researchers will commonly specify either a fixed- or random-effects model. But current advice on which approach should be preferred, and under what conditions, remains vague and sometimes contradictory. This study performs a series of Monte Carlo simulations to evaluate the total error due to bias and variance in the inferences of each model, for typical sizes and types of datasets encountered in applied research. The results offer a typology of dataset characteristics to help researchers choose a preferred model.},
  author = {Clark, Tom S. and Linzer, Drew A.},
  date = {2015-05},
  doi = {10/gdj7jw},
  file = {/Users/ryan/Documents/Zotero Library/Clark and Linzer - 2015 - Should I Use Fixed or Random Effects.pdf},
  issn = {2049-8470, 2049-8489},
  journaltitle = {Political Science Research and Methods},
  langid = {english},
  number = {2},
  pages = {399-408},
  shortjournal = {PSRM},
  title = {Should {{I Use Fixed}} or {{Random Effects}}?},
  volume = {3}
}

@article{Clark2014,
  author = {Clark, Neil R and Hu, Kevin S and Feldmann, Axel S and Kou, Yan and Chen, Edward Y and Duan, Qiaonan and Ma'ayan, Avi},
  date = {2014},
  doi = {10/gb8wmp},
  file = {/Users/ryan/Documents/Zotero Library/Clark et al. - 2014 - The characteristic direction a geometrical approa.pdf},
  issn = {1471-2105},
  journaltitle = {BMC Bioinformatics},
  number = {1},
  pages = {79},
  title = {The Characteristic Direction: A Geometrical Approach to Identify Differentially Expressed Genes},
  volume = {15}
}

@article{Cline2005,
  abstract = {MOTIVATION: Many or most mammalian genes undergo alternative splicing, generating a variety of transcripts from a single gene. New information on splice variation is becoming available through technology for measuring expression levels of several exons or splice junctions per gene. We have developed a statistical method, ANalysis Of Splice VAriation (ANOSVA) to detect alternative splicing from expression data. Since ANOSVA requires no transcript information, it can be applied when the level of annotation is poor. When validated against spiked clone data, it generated no false positives and few false negatives. We demonstrated ANOSVA with data from a prototype mouse alternative splicing array, run against normal adult tissues, yielding a set of genes with evidence of tissue-specific splice variation.

AVAILABILITY: The results are available at the supplementary information site.

SUPPLEMENTARY INFORMATION: The results are available at the supplementary information site https://bioinfo.affymetrix.com/Papers/ANOSVA/},
  author = {Cline, Melissa S and Blume, John and Cawley, Simon and Clark, Tyson A. and Hu, Jing-Shan and Lu, Gang and Salomonis, Nathan and Wang, Hui and Williams, Alan},
  date = {2005-06},
  doi = {10/dgh8xh},
  eprint = {15961447},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Cline et al. - 2005 - ANOSVA a statistical method for detecting splice .pdf},
  issn = {1367-4803},
  journaltitle = {Bioinformatics (Oxford, England)},
  keywords = {Alternative Splicing,Animals,Computational Biology,Computational Biology: methods,Databases; Protein,False Positive Reactions,Gene Expression Profiling,Mice,Models; Statistical,Oligonucleotide Array Sequence Analysis,Reproducibility of Results,Software},
  pages = {i107-15},
  title = {{{ANOSVA}}: A Statistical Method for Detecting Splice Variation from Expression Data},
  volume = {21 Suppl 1}
}

@article{Cline2007a,
  abstract = {Cytoscape is a free software package for visualizing, modeling and analyzing molecular and genetic interaction networks. This protocol explains how to use Cytoscape to analyze the results of mRNA expression profiling, and other functional genomics and proteomics experiments, in the context of an interaction network obtained for genes of interest. Five major steps are described: (i) obtaining a gene or protein network, (ii) displaying the network using layout algorithms, (iii) integrating with gene expression and other functional attributes, (iv) identifying putative complexes and functional modules and (v) identifying enriched Gene Ontology annotations in the network. These steps provide a broad sample of the types of analyses performed by Cytoscape.},
  author = {Cline, Melissa S and Smoot, Michael and Cerami, Ethan and Kuchinsky, Allan and Landys, Nerius and Workman, Chris and Christmas, Rowan and Avila-Campilo, Iliana and Creech, Michael and Gross, Benjamin and Hanspers, Kristina and Isserlin, Ruth and Kelley, Ryan and Killcoyne, Sarah and Lotia, Samad and Maere, Steven and Morris, John and Ono, Keiichiro and Pavlovic, Vuk and Pico, Alexander R and Vailaya, Aditya and Wang, Peng-Liang and Adler, Annette and Conklin, Bruce R and Hood, Leroy and Kuiper, Martin and Sander, Chris and Schmulevich, Ilya and Schwikowski, Benno and Warner, Guy J and Ideker, Trey and Bader, Gary D},
  date = {2007-01},
  doi = {10/bhxn6z},
  eprint = {17947979},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Cline et al. - 2007 - Integration of biological networks and gene expres.pdf},
  issn = {1750-2799},
  journaltitle = {Nature protocols},
  keywords = {Computational Biology,Computational Biology: methods,Gene Expression Profiling,Gene Expression Profiling: methods,Gene Regulatory Networks,Genomics,Genomics: methods,Proteomics,Proteomics: methods,RNA; Messenger,RNA; Messenger: metabolism,Software},
  number = {10},
  pages = {2366-82},
  title = {Integration of Biological Networks and Gene Expression Data Using {{Cytoscape}}},
  volume = {2}
}

@incollection{Cohen1988,
  author = {Cohen, Jacob},
  booktitle = {Statistical {{Power Analysis}} for the {{Behavioral Sciences}}},
  date = {1988},
  edition = {2},
  file = {/Users/ryan/Documents/Zotero Library/Cohen - 1988 - The Analysis of Variance.pdf},
  isbn = {0-8058-0283-5},
  keywords = {statistics},
  location = {{Hillsdale, NJ}},
  pages = {273-407},
  publisher = {{Lawrence Erlbaum Associates}},
  title = {The {{Analysis}} of {{Variance}}}
}

@article{Cohn2019,
  author       = {Cohn, Melvin and Anderson, Colin C. and Dembic, Zlatko},
  title        = {The Case for Allele-specific Recognition by the {{TCR}}},
  journaltitle = {Scandinavian Journal of Immunology},
  date         = {2019},
  volume       = {90},
  number       = {2},
  pages        = {1-5},
  issn         = {0300-9475},
  doi          = {10/gf3kh5},
  abstract     = {There is a sharp difference in how one views TCR structure-function-behaviour dependent on whether its recognition of major histocompatibility complex-encoded restriction elements (R) is germline selected or somatically generated. The generally accepted or Standard model is built on the assumption that recognition of R is by the V regions of the alphabeta TCR, which is not driven by allele specificity, whereas the competing model posits that recognition of R is allele-specific. The establishing of allele-specific recognition of R by the TCR would rule out the Standard model and clear the road to a consideration of a competing construct, the Tritope model. Here, the case for allele-specific recognition (germline selected) is detailed making it obvious that the Standard model is untenable. Copyright \textcopyright{} 2019 The Authors. Scandinavian Journal of Immunology published by John Wiley \& Sons Ltd on behalf of The Foundation for the Scandinavian Journal of Immunology.},
  file         = {/Users/ryan/Documents/Zotero Library/Cohn et al. - 2019 - The case for allele‐specific recognition by the TC.pdf},
}

@article{Cole2016,
  abstract = {RNA-seq is now the technology of choice for genome-wide differential gene expression experiments, but it is not clear how many biological replicates are needed to ensure valid biological interpretation of the results or which statistical tools are best for analyzing the data. An RNA-seq experiment with 48 biological replicates in each of two conditions was performed to answer these questions and provide guidelines for experimental design. With three biological replicates, eight of the 11 tools evaluated found only 20\%-40\% of the significantly differentially expressed (SDE) genes identified with the full set of 42 clean replicates. This rises to {$>$}85\% for the subset of SDE genes changing in expression by more than fourfold. To achieve {$>$}85\% for all SDE genes regardless of fold change requires more than 20 biological replicates. The same eight tools successfully control their false discovery rate at {$\lesssim$}5\% for all numbers of replicates, while the remaining three tools fail to control their FDR adequately, particularly for low numbers of replicates. For future RNA-seq experiments, these results suggest that more than six biological replicates should be used, rising to more than 12 when it is important to identify SDE genes for all fold changes. If less than 12 replicates are used, a superior combination of true positive and false positive performances makes edgeR the leading tool. For higher replicate numbers, minimizing false positives is more important and DESeq marginally outperforms the other tools.},
  author = {Schurch, Nicholas J and Schofield, Piet{\'a} and Gierli{\'n}ski, Marek and Cole, Christian and Sherstnev, Alexander and Singh, Vijender and Wrobel, Nicola and Gharbi, Karim and Simpson, Gordon G and Owen-Hughes, Tom and Blaxter, Mark and Barton, Geoffrey J},
  date = {2016-06},
  doi = {10/f8mrmk},
  eprint = {27022035},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Schurch et al. - 2016 - How many biological replicates are needed in an RN.pdf},
  issn = {1355-8382},
  journaltitle = {RNA},
  keywords = {★,benchmarking,differential expression,experimental design,replication,rna-seq,RNA-seq,statistical power,yeast},
  number = {6},
  pages = {839-851},
  title = {How Many Biological Replicates Are Needed in an {{RNA}}-Seq Experiment and Which Differential Expression Tool Should You Use?},
  volume = {22}
}

@article{Colf2007,
  abstract = {{$\alpha\beta$} T cell receptors (TCRs) can crossreact with both self- and foreign- major histocompatibility complex (MHC) proteins in an enigmatic phenomenon termed alloreactivity. Here we present the 2.35 \AA{} structure of the 2C TCR complexed with its foreign ligand H-2Ld-QL9. Surprisingly, we find that this TCR utilizes a different strategy to engage the foreign pMHC in comparison to the manner in which it recognizes a self ligand H-2Kb-dEV8. 2C engages both shared and polymorphic residues on Ld and Kb, as well as the unrelated QL9 and dEV8 peptide antigens, in unique pair-wise contacts, resulting in greater structural complementarity with the Ld-QL9 complex. In the structure of an engineered, high-affinity 2C TCR variant bound to H-2Ld-QL9, the "wild-type" TCR-MHC binding orientation persists despite modified TCR-CDR3{$\alpha$} interactions with peptide. Thus, a single TCR recognizes two globally similar, but distinct ligands by divergent mechanisms, indicating that receptor-ligand crossreactivity can occur in the absence of molecular mimicry. \textcopyright{} 2007 Elsevier Inc. All rights reserved.},
  author = {Colf, Leremy A. and Bankovich, Alexander J. and Hanick, Nicole A. and Bowerman, Natalie A. and Jones, Lindsay L. and Kranz, David M. and Garcia, K. Christopher},
  date = {2007-04},
  doi = {10/cr86v6},
  file = {/Users/ryan/Documents/Zotero Library/Colf et al. - 2007 - How a Single T Cell Receptor Recognizes Both Self .pdf},
  issn = {0092-8674},
  journaltitle = {Cell},
  number = {1},
  pages = {135-146},
  title = {How a {{Single T Cell Receptor Recognizes Both Self}} and {{Foreign MHC}}},
  volume = {129}
}

@article{Collado-Torres2016,
  abstract = {recount is a resource of processed and summarized expression data spanning nearly 60,000 human RNA-seq samples from the Sequence Read Archive (SRA). The associated recount Bioconductor package provides a convenient API for querying, downloading, and analyzing the data. Each processed study consists of meta/phenotype data, the expression levels of genes and their underlying exons and splice junctions, and corresponding genomic annotation. We also provide data summarization types for quantifying novel transcribed sequence including base-resolution coverage and potentially unannotated splice junctions. We present workflows illustrating how to use recount to perform differential expression analysis including meta-analysis, annotation-free base-level analysis, and replication of smaller studies using data from larger studies. recount provides a valuable and user-friendly resource of processed RNA-seq datasets to draw additional biological insights from existing public data. The resource is available at https://jhubiostatistics.shinyapps.io/recount/.},
  author = {Collado-Torres, Leonardo and Nellore, Abhinav and Kammers, Kai and Ellis, Shannon E and Taub, Margaret A and Hansen, Kasper D and Jaffe, Andrew E and Langmead, Ben and Leek, Jeffrey},
  date = {2016},
  doi = {10/ggcxj7},
  file = {/Users/ryan/Documents/Zotero Library/Collado-Torres et al. - 2016 - recount A large-scale resource of analysis-ready .pdf},
  journaltitle = {bioRxiv},
  pages = {068478},
  title = {{{recount}}: {{A}} Large-Scale Resource of Analysis-Ready {{RNA}}-Seq Expression Data}
}

@article{Conesa2006,
  abstract = {MOTIVATION: Multi-series time-course microarray experiments are useful approaches for exploring biological processes. In this type of experiments, the researcher is frequently interested in studying gene expression changes along time and in evaluating trend differences between the various experimental groups. The large amount of data, multiplicity of experimental conditions and the dynamic nature of the experiments poses great challenges to data analysis. RESULTS: In this work, we propose a statistical procedure to identify genes that show different gene expression profiles across analytical groups in time-course experiments. The method is a two-regression step approach where the experimental groups are identified by dummy variables. The procedure first adjusts a global regression model with all the defined variables to identify differentially expressed genes, and in second a variable selection strategy is applied to study differences between groups and to find statistically significant different profiles. The methodology is illustrated on both a real and a simulated microarray dataset.},
  author = {Conesa, Ana and Nueda, Mar{\'i}a Jos{\'e} and Ferrer, Alberto and Tal{\'o}n, Manuel},
  date = {2006-05-01},
  doi = {10/ffrtqj},
  eprint = {16481333},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Conesa et al. - 2006 - maSigPro a method to identify significantly diffe.pdf},
  issn = {1367-4803},
  journaltitle = {Bioinformatics (Oxford, England)},
  keywords = {Algorithms,Computer Simulation,Gene Expression,Gene Expression Profiling,Gene Expression Profiling: methods,Gene Expression: physiology,Models; Genetic,Models; Statistical,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods,Software,Time Factors},
  number = {9},
  pages = {1096-102},
  title = {{{maSigPro}}: A Method to Identify Significantly Differential Expression Profiles in Time-Course Microarray Experiments},
  volume = {22}
}

@article{Conesa2016,
  abstract = {RNA-sequencing (RNA-seq) has a wide variety of applications, but no single analysis pipeline can be used in all cases. We review all of the major steps in RNA-seq data analysis, including experimental design, quality control, read alignment, quantification of gene and transcript levels, visualization, differential gene expression, alternative splicing, functional analysis, gene fusion detection and eQTL mapping. We highlight the challenges associated with each step. We discuss the analysis of small RNAs and the integration of RNA-seq with other functional genomics techniques. Finally, we discuss the outlook for novel technologies that are changing the state of the art in transcriptomics.},
  author = {Conesa, Ana and Madrigal, Pedro and Tarazona, Sonia and Gomez-Cabrero, David and Cervera, Alejandra and McPherson, Andrew and Szcze{\'s}niak, Michal Wojciech and Gaffney, Daniel J. and Elo, Laura L. and Zhang, Xuegong and Mortazavi, Ali},
  date = {2016-01-26},
  doi = {10/f3vhpj},
  eprint = {26813401},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Conesa et al. - 2016 - A survey of best practices for RNA-seq data analys.pdf},
  issn = {1474-760X},
  journaltitle = {Genome Biology},
  number = {1},
  title = {A Survey of Best Practices for {{RNA}}-Seq Data Analysis},
  volume = {17}
}

@article{Cordero2012,
  abstract = {BACKGROUND: Massive Parallel Sequencing methods (MPS) can extend and improve the knowledge obtained by conventional microarray technology, both for mRNAs and short non-coding RNAs, e.g. miRNAs. The processing methods used to extract and interpret the information are an important aspect of dealing with the vast amounts of data generated from short read sequencing. Although the number of computational tools for MPS data analysis is constantly growing, their strengths and weaknesses as part of a complex analytical pipe-line have not yet been well investigated. PRIMARY FINDINGS: A benchmark MPS miRNA dataset, resembling a situation in which miRNAs are spiked in biological replication experiments was assembled by merging a publicly available MPS spike-in miRNAs data set with MPS data derived from healthy donor peripheral blood mononuclear cells. Using this data set we observed that short reads counts estimation is strongly under estimated in case of duplicates miRNAs, if whole genome is used as reference. Furthermore, the sensitivity of miRNAs detection is strongly dependent by the primary tool used in the analysis. Within the six aligners tested, specifically devoted to miRNA detection, SHRiMP and MicroRazerS show the highest sensitivity. Differential expression estimation is quite efficient. Within the five tools investigated, two of them (DESseq, baySeq) show a very good specificity and sensitivity in the detection of differential expression. CONCLUSIONS: The results provided by our analysis allow the definition of a clear and simple analytical optimized workflow for miRNAs digital quantitative analysis.},
  author = {Cordero, Francesca and Beccuti, Marco and Arigoni, Maddalena and Donatelli, Susanna and Calogero, Raffaele A.},
  date = {2012-01},
  doi = {10/ggcxj8},
  eprint = {22363693},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Cordero et al. - 2012 - Optimizing a massive parallel sequencing workflow .pdf},
  issn = {1932-6203},
  journaltitle = {PloS one},
  keywords = {Algorithms,Databases; Genetic,Gene Expression Profiling,Gene Expression Regulation,Genome; Human,Genome; Human: genetics,High-Throughput Nucleotide Sequencing,High-Throughput Nucleotide Sequencing: methods,Humans,MicroRNAs,MicroRNAs: genetics,MicroRNAs: metabolism,Reference Standards,ROC Curve,Sample Size,Sequence Alignment,Software,Workflow},
  number = {2},
  pages = {e31630},
  title = {Optimizing a Massive Parallel Sequencing Workflow for Quantitative {{miRNA}} Expression Analysis},
  volume = {7}
}

@article{Crane2012,
  author = {Crane, Brian R},
  date = {2012-07-13},
  doi = {10/ggcxj9},
  eprint = {22798591},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Crane - 2012 - Biochemistry. Nature's intricate clockwork..pdf},
  issn = {1095-9203},
  journaltitle = {Science},
  keywords = {Animals,ARNTL Transcription Factors,ARNTL Transcription Factors: chemistry,Circadian Rhythm,CLOCK Proteins,CLOCK Proteins: chemistry,Humans,Transcriptional Activation},
  number = {6091},
  pages = {165-166},
  title = {Biochemistry. {{Nature}}'s Intricate Clockwork},
  volume = {337}
}

@article{crawfordGenomewideMappingDNase2006,
  author       = {Crawford, Gregory E. and Holt, Ingeborg E. and Whittle, James and Webb, Bryn D. and Tai, Denise and Davis, Sean and Margulies, Elliott H. and Chen, YiDong and Bernat, John A. and Ginsburg, David and Zhou, Daixing and Luo, Shujun and Vasicek, Thomas J. and Daly, Mark J. and Wolfsberg, Tyra G. and Collins, Francis S.},
  title        = {Genome-Wide Mapping of {{DNase}} Hypersensitive Sites Using Massively Parallel Signature Sequencing ({{MPSS}})},
  journaltitle = {Genome Research},
  shortjournal = {Genome Res.},
  date         = {2006-01-01},
  volume       = {16},
  number       = {1},
  pages        = {123-131},
  doi          = {10/b8cpkk},
  eprint       = {16344561},
  eprinttype   = {pmid},
  issn         = {1088-9051, 1549-5469},
  langid       = {english},
  abstract     = {A major goal in genomics is to understand how genes are regulated in different tissues, stages of development, diseases, and species. Mapping DNase I hypersensitive (HS) sites within nuclear chromatin is a powerful and well-established method of identifying many different types of regulatory elements, but in the past it has been limited to analysis of single loci. We have recently described a protocol to generate a genome-wide library of DNase HS sites. Here, we report high-throughput analysis, using massively parallel signature sequencing (MPSS), of 230,000 tags from a DNase library generated from quiescent human CD4+ T cells. Of the tags that uniquely map to the genome, we identified 14,190 clusters of sequences that group within close proximity to each other. By using a real-time PCR strategy, we determined that the majority of these clusters represent valid DNase HS sites. Approximately 80\% of these DNase HS sites uniquely map within one or more annotated regions of the genome believed to contain regulatory elements, including regions 2 kb upstream of genes, CpG islands, and highly conserved sequences. Most DNase HS sites identified in CD4+ T cells are also HS in CD8+ T cells, B cells, hepatocytes, human umbilical vein endothelial cells (HUVECs), and HeLa cells. However, {$\sim$}10\% of the DNase HS sites are lymphocyte specific, indicating that this procedure can identify gene regulatory elements that control cell type specificity. This strategy, which can be applied to any cell line or tissue, will enable a better understanding of how chromatin structure dictates cell function and fate.},
  file         = {/Users/ryan/Documents/Zotero Library/Crawford et al. - 2006 - Genome-wide mapping of DNase hypersensitive sites .pdf;/Users/ryan/Zotero/storage/S9XKYHB6/123.html}
}

@article{Cui2003,
  abstract = {Extracting biological information from microarray data requires appropriate statistical methods. The simplest statistical method for detecting differential expression is the t test, which can be used to compare two conditions when there is replication of samples. With more than two conditions, analysis of variance (ANOVA) can be used, and the mixed ANOVA model is a general and powerful approach for microarray experiments with multiple factors and/or several sources of variation.},
  author = {Cui, Xiangqin and Churchill, Gary A.},
  date = {2003-01},
  eprint = {12702200},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Cui and Churchill - 2003 - Statistical tests for differential expression in c.pdf},
  issn = {1465-6914},
  journaltitle = {Genome Biology},
  keywords = {Analysis of Variance,Gene Expression Profiling,Gene Expression Profiling: statistics & numerical,Matched-Pair Analysis,Oligonucleotide Array Sequence Analysis,Statistics as Topic},
  number = {4},
  pages = {210},
  title = {Statistical Tests for Differential Expression in {{cDNA}} Microarray Experiments},
  url = {http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=154570&tool=pmcentrez&rendertype=abstract},
  volume = {4}
}

@article{Dabney2007,
  author       = {Dabney, Alan R and Storey, John D},
  editor       = {Zhu, Ji},
  title        = {Optimality {{Driven Nearest Centroid Classification}} from {{Genomic Data}}},
  journaltitle = {PLoS ONE},
  date         = {2007-10-03},
  volume       = {2},
  number       = {10},
  pages        = {e1002},
  doi          = {10/fg49tj},
  eprint       = {17912341},
  eprinttype   = {pmid},
  issn         = {1932-6203},
  keywords     = {Algorithms,Automated,Child,Data Interpretation,Discriminant Analysis,Gene Expression Profiling,Gene Expression Regulation,Genetic Techniques,Genomics,Humans,Leukemia,Lymphoma,Lymphoma: genetics,Models,Neoplastic,Oligonucleotide Array Sequence Analysis,Pattern Recognition,Statistical,Theoretical},
  abstract     = {Nearest-centroid classifiers have recently been successfully employed in high-dimensional applications, such as in genomics. A necessary step when building a classifier for high-dimensional data is feature selection. Feature selection is frequently carried out by computing univariate scores for each feature individually, without consideration for how a subset of features performs as a whole. We introduce a new feature selection approach for high-dimensional nearest centroid classifiers that instead is based on the theoretically optimal choice of a given number of features, which we determine directly here. This allows us to develop a new greedy algorithm to estimate this optimal nearest-centroid classifier with a given number of features. In addition, whereas the centroids are usually formed from maximum likelihood estimates, we investigate the applicability of high-dimensional shrinkage estimates of centroids. We apply the proposed method to clinical classification based on gene-expression microarrays, demonstrating that the proposed method can outperform existing nearest centroid classifiers.},
  file         = {/Users/ryan/Documents/Zotero Library/Dabney and Storey - 2007 - Optimality Driven Nearest Centroid Classification .pdf}
}

@article{Daily2011,
  abstract = {A central challenge of biology is to map and understand gene regulation on a genome-wide scale. For any given genome, only a small fraction of the regulatory elements embedded in the DNA sequence have been characterized, and there is great interest in developing computational methods to systematically map all these elements and understand their relationships. Such computational efforts, however, are significantly hindered by the overwhelming size of non-coding regions and the statistical variability and complex spatial organizations of regulatory elements and interactions. Genome-wide catalogs of regulatory elements for all model species simply do not yet exist. The MotifMap system uses databases of transcription factor binding motifs, refined genome alignments, and a comparative genomic statistical approach to provide comprehensive maps of candidate regulatory elements encoded in the genomes of model species. The system is used to derive new genome-wide maps for yeast, fly, worm, mouse, and human. The human map contains 519,108 sites for 570 matrices with a False Discovery Rate of 0.1 or less. The new maps are assessed in several ways, for instance using high-throughput experimental ChIP-seq data and AUC statistics, providing strong evidence for their accuracy and coverage. The maps can be usefully integrated with many other kinds of omic data and are available at http://motifmap.igb.uci.edu/. MotifMap and its integration with other data provide a foundation for analyzing gene regulation on a genome-wide scale, and for automatically generating regulatory pathways and hypotheses. The power of this approach is demonstrated and discussed using the P53 apoptotic pathway and the Gli hedgehog pathways as examples.},
  author = {Daily, Kenneth and Patel, Vishal R and Rigor, Paul and Xie, Xiaohui and Baldi, Pierre},
  date = {2011},
  doi = {10/fxp55k},
  eprint = {22208852},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Daily et al. - 2011 - MotifMap integrative genome-wide maps of regulato.pdf},
  issn = {1471-2105},
  journaltitle = {BMC Bioinformatics},
  keywords = {Algorithms,Bioinformatics,Combinatorial Libraries,Computational Biology/Bioinformatics,Computer Appl. in Life Sciences,Microarrays},
  number = {1},
  pages = {495},
  title = {{{MotifMap}}: Integrative Genome-Wide Maps of Regulatory Motif Sites for Model Species},
  volume = {12}
}

@book{darwinOriginSpeciesMeans1999,
  author = {Darwin, Charles},
  date = {1999-12-01},
  edition = {6},
  file = {/Users/ryan/Documents/Zotero Library/Darwin - 1999 - The Origin of Species by Means of Natural Selectio.pdf;/Users/ryan/Zotero/storage/TRX8QJY7/2009-h.html},
  keywords = {Evolution (Biology),Natural selection},
  langid = {english},
  title = {The {{Origin}} of {{Species}} by {{Means}} of {{Natural Selection}}, or the {{Preservation}} of {{Favoured Races}} in the {{Struggle}} for {{Life}}},
  url = {http://www.gutenberg.org/ebooks/2009},
  urldate = {2019-11-14}
}

@article{DeCaprio2007,
  author = {DeCaprio, David and Vinson, Jade P. and Pearson, Matthew D. and Montgomery, Philip and Doherty, Matthew and Galagan, James E.},
  date = {2007},
  doi = {10/c825z6},
  file = {/Users/ryan/Documents/Zotero Library/DeCaprio and Vinson - 2007 - Conrad gene prediction using conditional random f.pdf},
  journaltitle = {Genome Research},
  number = {9},
  pages = {1389-1398},
  title = {Conrad: Gene Prediction Using Conditional Random Fields},
  volume = {17}
}

@article{Delhomme2012,
  abstract = {RNA sequencing is becoming a standard for expression profiling experiments and many tools have been developed in the past few years to analyze RNA-Seq data. Numerous 'Bioconductor' packages are available for next-generation sequencing data loading in R, e.g. ShortRead and Rsamtools as well as to perform differential gene expression analyses, e.g. DESeq and edgeR. However, the processing tasks lying in between these require the precise interplay of many Bioconductor packages, e.g. Biostrings, IRanges or external solutions are to be sought.},
  author = {Delhomme, Nicolas and Padioleau, Isma{\"e}l and Furlong, Eileen E and Steinmetz, Lars M},
  date = {2012-10-01},
  doi = {10/f3p4q3},
  eprint = {22847932},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Delhomme et al. - 2012 - easyRNASeq a bioconductor package for processing .pdf},
  issn = {1367-4811},
  journaltitle = {Bioinformatics},
  number = {19},
  pages = {2532-2533},
  title = {{{easyRNASeq}}: A {{Bioconductor}} Package for Processing {{RNA}}-{{Seq}} Data},
  volume = {28}
}

@article{Demissie2008,
  abstract = {MOTIVATION: In searching for differentially expressed (DE) genes in microarray data, we often observe a fraction of the genes to have unequal variability between groups. This is not an issue in large samples, where a valid test exists that uses individual variances separately. The problem arises in the small-sample setting, where the approximately valid Welch test lacks sensitivity, while the more sensitive moderated t-test assumes equal variance.

METHODS: We introduce a moderated Welch test (MWT) that allows unequal variance between groups. It is based on (i) weighting of pooled and unpooled standard errors and (ii) improved estimation of the gene-level variance that exploits the information from across the genes.

RESULTS: When a non-trivial proportion of genes has unequal variability, false discovery rate (FDR) estimates based on the standard t and moderated t-tests are often too optimistic, while the standard Welch test has low sensitivity. The MWT is shown to (i) perform better than the standard t, the standard Welch and the moderated t-tests when the variances are unequal between groups and (ii) perform similarly to the moderated t, and better than the standard t and Welch tests when the group variances are equal. These results mean that MWT is more reliable than other existing tests over wider range of data conditions.

AVAILABILITY: R package to perform MWT is available at http://www.meb.ki.se/\textasciitilde{}yudpaw},
  author = {Demissie, Meaza and Mascialino, Barbara and Calza, Stefano and Pawitan, Yudi},
  date = {2008-05-01},
  doi = {10/fnk8vv},
  eprint = {18344518},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Demissie et al. - 2008 - Unequal group variances in microarray data analyse.pdf},
  issn = {1367-4811},
  journaltitle = {Bioinformatics},
  keywords = {Algorithms,Analysis of Variance,Artifacts,Data Interpretation; Statistical,Gene Expression Profiling,Gene Expression Profiling: methods,Genetic Variation,Genetic Variation: genetics,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods,Reproducibility of Results,Sample Size,Sensitivity and Specificity},
  number = {9},
  pages = {1168-1174},
  title = {Unequal Group Variances in Microarray Data Analyses},
  volume = {24}
}

@article{Dey2018,
  abstract = {Estimation of correlation matrices and correlations among variables is a ubiquitous problem in statistics. In many cases -- especially when the number of observations is small relative to the number of variables -- some kind of shrinkage or regularization is necessary to improve estimation accuracy. Here, we propose an Empirical Bayes shrinkage approach, CorShrink, which adaptively learns how much to shrink correlations by combining information across all pairs of variables. One key feature of CorShrink, which distinguishes it from most existing methods, is its flexibility in dealing with missing data. Indeed, CorShrink explicitly accounts for varying amounts of missingness among pairs of variables. Numerical studies suggest CorShrink is competitive with other popular correlation shrinkage methods, even when there is no missing data. We illustrate CorShrink on gene expression data from GTEx project, which suffers from extensive missing observations, and where existing methods struggle. We also illustrate its flexibility by applying it to estimate cosine similarities between word vectors from word2vec models, thereby generating more accurate word similarity rankings.},
  author = {Dey, Kushal K. and Stephens, Matthew},
  date = {2018},
  doi = {10/gdtg3j},
  file = {/Users/ryan/Documents/Zotero Library/Dey and Stephens - 2018 - CorShrink  Empirical Bayes shrinkage estimation o.pdf},
  journaltitle = {bioRxiv},
  keywords = {correlation shrinkage,empirical bayes,genomics,missing data},
  pages = {368316},
  title = {{{CorShrink}}: {{Empirical Bayes}} Shrinkage Estimation of Correlations, with Applications}
}

@report{deyVisualizingStructureRNAseq2016,
  abstract = {Grade of membership models, also known as ``admixture models'', ``topic models'' or ``Latent Dirichlet Allocation'', are a generalization of cluster models that allow each sample to have membership in multiple clusters. These models are widely used in population genetics to model admixed individuals who have ancestry from multiple ``populations'', and in natural language processing to model documents having words from multiple ``topics''. Here we illustrate the potential for these models to cluster samples of RNA-seq gene expression data, measured on either bulk samples or single cells. We also provide methods to help interpret the clusters, by identifying genes that are distinctively expressed in each cluster. By applying these methods to several example RNA-seq applications we demonstrate their utility in identifying and summarizing structure and heterogeneity. Applied to data from the GTEx project on 53 human tissues, the approach highlights similarities among biologically-related tissues and identifies distinctively-expressed genes that recapitulate known biology. Applied to single-cell expression data from mouse preimplantation embryos, the approach highlights both discrete and continuous variation through early embryonic development stages, and highlights genes involved in a variety of relevant processes \textendash{} from germ cell development, through compaction and morula formation, to the formation of inner cell mass and trophoblast at the blastocyst stage. The methods are implemented in the Bioconductor package CountClust.

Author Summary: Gene expression profile of a biological sample (either from single cells or pooled cells) results from a complex interplay of multiple related biological processes. Consequently, for example, distal tissue samples may share a similar gene expression profile through some common underlying biological processes. Our goal here is to illustrate that grade of membership (GoM) models \textendash{} an approach widely used in population genetics to cluster admixed individuals who have ancestry from multiple populations \textendash{} provide an attractive approach for clustering biological samples of RNA sequencing data. The GoM model allows each biological sample to have partial memberships in multiple biologically-distinct clusters, in contrast to traditional clustering methods that partition samples into distinct subgroups. We also provide methods for identifying genes that are distinctively expressed in each cluster to help biologically interpret the results. Applied to a dataset of 53 human tissues, the GoM approach highlights similarities among biologically-related tissues and identifies distinctively-expressed genes that recapitulate known biology. Applied to gene expression data of single cells from mouse preimplantation embryos, the approach highlights both discrete and continuous variation through early embryonic development stages, and genes involved in a variety of relevant processes. Our study highlights the potential of GoM models for elucidating biological structure in RNA-seq gene expression data.},
  author = {Dey, Kushal K and Hsiao, Chiaowen Joyce and Stephens, Matthew},
  date = {2016-05-04},
  doi = {10/dd8r},
  file = {/Users/ryan/Documents/Zotero Library/Dey et al. - 2016 - Visualizing the Structure of RNA-seq Expression Da.pdf},
  ids = {DeyKushalHsiaoJoyce2016},
  institution = {{bioRxiv}},
  langid = {english},
  title = {Visualizing the {{Structure}} of {{RNA}}-Seq {{Expression Data}} Using {{Grade}} of {{Membership Models}}},
  type = {preprint},
  url = {http://biorxiv.org/lookup/doi/10.1101/051631},
  urldate = {2019-11-14}
}

% NOTE(review): this record looks like failed automatic PDF-metadata extraction -- the "authors" read as the place names San Diego and Santa Cruz, and there is no journal or date. TODO: confirm against the attached PDF and replace with the details of the parent article.
@article{Diego,
  author = {Diego, San and Cruz, Santa},
  file = {/Users/ryan/Documents/Zotero Library/Diego and Cruz - Supporting Online Material ( SOM ) Materials and M.pdf},
  number = {2},
  title = {Supporting {{Online Material}} ({{SOM}}) {{Materials}} and {{Methods}}},
  volume = {001}
}

@article{Dillies2012,
  abstract = {During the last 3 years, a number of approaches for the normalization of RNA sequencing data have emerged in the literature, differing both in the type of bias adjustment and in the statistical strategy adopted. However, as data continue to accumulate, there has been no clear consensus on the appropriate normalization method to be used or the impact of a chosen method on the downstream analysis. In this work, we focus on a comprehensive comparison of seven recently proposed normalization methods for the differential analysis of RNA-seq data, with an emphasis on the use of varied real and simulated datasets involving different species and experimental designs to represent data characteristics commonly observed in practice. Based on this comparison study, we propose practical recommendations on the appropriate normalization method to be used and its impact on the differential analysis of RNA-seq data. \textcopyright{} The Author 2012. Published by Oxford University Press.},
  author = {Dillies, M.-A. and Rau, Andrea and Aubert, Julie and Hennequet-Antier, Christelle and Jeanmougin, Marine and Servant, Nicolas and Keime, C\'eline and Marot, G. and Castel, David and Estelle, Jordi and Guernec, Gregory and Jagla, Bernd and Jouneau, Luc and Laloe, D. and Le Gall, Caroline and Schaeffer, B. and Le Crom, St\'ephane and Guedj, Micka\"el and Jaffrezic, F.},
  date = {2013-11-01},
  doi = {10/f5hx7v},
  eprint = {22988256},
  eprinttype = {pmid},
  issn = {1467-5463, 1477-4054},
  journaltitle = {Briefings in Bioinformatics},
  keywords = {\#nosource,Differential analysis,High-throughput sequencing,Normalization,RNA-seq},
  number = {6},
  pages = {671-683},
  title = {A Comprehensive Evaluation of Normalization Methods for {{Illumina}} High-Throughput {{RNA}} Sequencing Data Analysis},
  volume = {14}
}

@article{Dobbin2005,
  abstract = {Determining sample sizes for microarray experiments is important but the complexity of these experiments, and the large amounts of data they produce, can make the sample size issue seem daunting, and tempt researchers to use rules of thumb in place of formal calculations based on the goals of the experiment. Here we present formulae for determining sample sizes to achieve a variety of experimental goals, including class comparison and the development of prognostic markers. Results are derived which describe the impact of pooling, technical replicates and dye-swap arrays on sample size requirements. These results are shown to depend on the relative sizes of different sources of variability. A variety of common types of experimental situations and designs used with single-label and dual-label microarrays are considered. We discuss procedures for controlling the false discovery rate. Our calculations are based on relatively simple yet realistic statistical models for the data, and provide straightforward sample size calculation formulae.},
  author = {Dobbin, Kevin and Simon, Richard},
  date = {2005-01},
  doi = {10/fw59gn},
  eprint = {15618525},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Dobbin and Simon - 2005 - Sample size determination in microarray experiment.pdf},
  issn = {1465-4644},
  journaltitle = {Biostatistics},
  keywords = {Biological Markers,False Positive Reactions,Humans,Linear Models,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods,Predictive Value of Tests,Prognosis,Sample Size},
  number = {1},
  pages = {27-38},
  title = {Sample Size Determination in Microarray Experiments for Class Comparison and Prognostic Classification},
  volume = {6}
}

@manual{Dobin2013,
  author = {Dobin, Alex},
  date = {2013},
  file = {/Users/ryan/Documents/Zotero Library/Dobin - 2013 - STAR manual.pdf},
  pagetotal = {21},
  title = {{{STAR}} Manual}
}

@article{dobinSTARUltrafastUniversal2013,
  abstract = {MOTIVATION: Accurate alignment of high-throughput RNA-seq data is a challenging and yet unsolved problem because of the non-contiguous transcript structure, relatively short read lengths and constantly increasing throughput of the sequencing technologies. Currently available RNA-seq aligners suffer from high mapping error rates, low mapping speed, read length limitation and mapping biases. RESULTS: To align our large (exceeding 80 billion reads) ENCODE Transcriptome RNA-seq dataset we developed the Spliced Transcripts Alignment to a Reference (STAR) software based on a previously un-described RNA-seq alignment algorithm which utilizes sequential maximum mappable seed search in uncompressed suffix arrays followed by seed clustering and stitching procedure. STAR outperforms other aligners by more than a factor of 50 in mapping speed, aligning to the human genome 550 Million 2x76bp paired-end reads per hour on a modest 12-core server, while at the same time improving alignment sensitivity and precision. In addition to unbiased de novo detection of canonical junctions, STAR can discover non-canonical splices and chimeric (fusion) transcripts, and is also capable of mapping full length RNA sequences. Using Roche 454 sequencing of RT-PCR amplicons, we experimentally validated 1,960 novel intergenic splice junctions with an 80-90\% success rate, corroborating the high precision of the STAR mapping strategy. Implementation and AVAILABILITY: STAR is implemented as a standalone C++ code. STAR is free open source software distributed under GPLv3 license and can be downloaded from http://code.google.com/p/rna-star/ CONTACT: dobin@cshl.edu.},
  author = {Dobin, Alexander and Davis, Carrie A. and Schlesinger, Felix and Drenkow, Jorg and Zaleski, Chris and Jha, Sonali and Batut, Philippe and Chaisson, Mark and Gingeras, Thomas R.},
  date = {2013-01},
  doi = {10/f4h523},
  file = {/Users/ryan/Documents/Zotero Library/Dobin et al. - 2013 - STAR ultrafast universal RNA-seq aligner.pdf;/Users/ryan/Documents/Zotero Library/Dobin et al. - 2013 - STAR ultrafast universal RNA-seq aligner2.pdf},
  ids = {Dobin2013b},
  issn = {1460-2059},
  journaltitle = {Bioinformatics},
  keywords = {Algorithms,Cluster Analysis,Gene Expression Profiling,Genome,Human,Humans,RNA,RNA Splicing,RNA: methods,Sequence Alignment,Sequence Alignment: methods,Sequence Analysis,Software},
  langid = {english},
  number = {1},
  pages = {15-21},
  shorttitle = {{{STAR}}},
  title = {{{STAR}}: Ultrafast Universal {{RNA}}-Seq Aligner},
  volume = {29}
}

@article{Dorff2013,
  abstract = {We present GobyWeb, a web-based system that facilitates the management and analysis of high-throughput sequencing (HTS) projects. The software provides integrated support for a broad set of HTS analyses and offers a simple plugin extension mechanism. Analyses currently supported include quantification of gene expression for messenger and small RNA sequencing, estimation of DNA methylation (i.e., reduced bisulfite sequencing and whole genome methyl-seq), or the detection of pathogens in sequenced data. In contrast to previous analysis pipelines developed for analysis of HTS data, GobyWeb requires significantly less storage space, runs analyses efficiently on a parallel grid, scales gracefully to process tens or hundreds of multi-gigabyte samples, yet can be used effectively by researchers who are comfortable using a web browser. We conducted performance evaluations of the software and found it to either outperform or have similar performance to analysis programs developed for specialized analyses of HTS data. We found that most biologists who took a one-hour GobyWeb training session were readily able to analyze RNA-Seq data with state of the art analysis tools. GobyWeb can be obtained at http://gobyweb.campagnelab.org and is freely available for non-commercial use. GobyWeb plugins are distributed in source code and licensed under the open source LGPL3 license to facilitate code inspection, reuse and independent extensions http://github.com/CampagneLaboratory/gobyweb2-plugins.},
  author = {Dorff, Kevin C and Chambwe, Nyasha and Zeno, Zachary and Simi, Manuele and Shaknovich, Rita and Campagne, Fabien},
  date = {2013-01},
  doi = {10/ggcxkb},
  eprint = {23936070},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Dorff et al. - 2013 - GobyWeb Simplified Management and Analysis of Gen.pdf},
  issn = {1932-6203},
  journaltitle = {PLOS ONE},
  number = {7},
  pages = {e69666},
  title = {{{GobyWeb}}: Simplified Management and Analysis of Gene Expression and {{DNA}} Methylation Sequencing Data},
  volume = {8}
}

@article{Dougherty2010,
  abstract = {Classification in bioinformatics often suffers from small samples in conjunction with large numbers of features, which makes error estimation problematic. When a sample is small, there is insufficient data to split the sample and the same data are used for both classifier design and error estimation. Error estimation can suffer from high variance, bias, or both. The problem of choosing a suitable error estimator is exacerbated by the fact that estimation performance depends on the rule used to design the classifier, the feature-label distribution to which the classifier is to be applied, and the sample size. This paper reviews the performance of training-sample error estimators with respect to several criteria: estimation accuracy, variance, bias, correlation with the true error, regression on the true error, and accuracy in ranking feature sets. A number of error estimators are considered: resubstitution, leave-one-out cross-validation, 10-fold cross-validation, bolstered resubstitution, semi-bolstered resubstitution, .632 bootstrap, .632+ bootstrap, and optimal bootstrap. It illustrates these performance criteria for certain models and for two real data sets, referring to the literature for more extensive applications of these criteria. The results given in the present paper are consistent with those in the literature and lead to two conclusions: (1) much greater effort needs to be focused on error estimation, and (2) owing to the generally poor performance of error estimators on small samples, for a conclusion based on a small-sample error estimator to be considered valid, it should be supported by evidence that the estimator in question can be expected to perform sufficiently well under the circumstances to justify the conclusion.},
  author = {Dougherty, Edward R. and Sima, Chao and Hanczar, Blaise and Braga-Neto, Ulisses M.},
  date = {2010},
  doi = {10/cwh34z},
  file = {/Users/ryan/Documents/Zotero Library/Dougherty et al. - 2010 - Performance of Error Estimators for Classification.pdf},
  isbn = {9798628896},
  issn = {1574-8936},
  journaltitle = {Current Bioinformatics},
  keywords = {classification,epistemology,error estimation,validity},
  number = {1},
  pages = {53-67},
  title = {Performance of Error Estimators for Classification},
  volume = {5}
}

@article{Du2010,
  abstract = {BACKGROUND: High-throughput profiling of DNA methylation status of CpG islands is crucial to understand the epigenetic regulation of genes. The microarray-based Infinium methylation assay by Illumina is one platform for low-cost high-throughput methylation profiling. Both Beta-value and M-value statistics have been used as metrics to measure methylation levels. However, there are no detailed studies of their relations and their strengths and limitations.

RESULTS: We demonstrate that the relationship between the Beta-value and M-value methods is a Logit transformation, and show that the Beta-value method has severe heteroscedasticity for highly methylated or unmethylated CpG sites. In order to evaluate the performance of the Beta-value and M-value methods for identifying differentially methylated CpG sites, we designed a methylation titration experiment. The evaluation results show that the M-value method provides much better performance in terms of Detection Rate (DR) and True Positive Rate (TPR) for both highly methylated and unmethylated CpG sites. Imposing a minimum threshold of difference can improve the performance of the M-value method but not the Beta-value method. We also provide guidance for how to select the threshold of methylation differences.

CONCLUSIONS: The Beta-value has a more intuitive biological interpretation, but the M-value is more statistically valid for the differential analysis of methylation levels. Therefore, we recommend using the M-value method for conducting differential methylation analysis and including the Beta-value statistics when reporting the results to investigators.},
  author = {Du, Pan and Zhang, Xiao and Huang, Chiang-Ching and Jafari, Nadereh and Kibbe, Warren A and Hou, Lifang and Lin, Simon M},
  date = {2010},
  doi = {10/b3p84w},
  eprint = {21118553},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Du et al. - 2010 - Comparison of Beta-value and M-value methods for q.pdf},
  issn = {1471-2105},
  journaltitle = {BMC bioinformatics},
  keywords = {CpG Islands,Data Interpretation; Statistical,DNA Methylation,Microarray Analysis,Microarray Analysis: methods},
  number = {1},
  pages = {587},
  title = {Comparison of {{Beta}}-Value and {{M}}-Value Methods for Quantifying Methylation Levels by Microarray Analysis},
  volume = {11}
}

@article{Dunham2012,
  abstract = {The human genome encodes the blueprint of life, but the function of the vast majority of its nearly three billion bases is unknown. The Encyclopedia of DNA Elements (ENCODE) project has systematically mapped regions of transcription, transcription factor association, chromatin structure and histone modification. These data enabled us to assign biochemical functions for 80\% of the genome, in particular outside of the well-studied protein-coding regions. Many discovered candidate regulatory elements are physically associated with one another and with expressed genes, providing new insights into the mechanisms of gene regulation. The newly identified elements also show a statistical correspondence to sequence variants linked to human disease, and can thereby guide interpretation of this variation. Overall, the project provides new insights into the organization and regulation of our genes and genome, and is an expansive resource of functional annotations for biomedical research. \textcopyright{} 2012 Macmillan Publishers Limited. All rights reserved.},
  author = {Dunham, Ian and Kundaje, Anshul and Aldred, Shelley F. and Collins, Patrick J. and Davis, Carrie A. and Doyle, Francis and Epstein, Charles B. and Frietze, Seth and Harrow, Jennifer and Kaul, Rajinder and Khatun, Jainab and Lajoie, Bryan R. and Landt, Stephen G. and Lee, Bum Kyu and Pauli, Florencia and Rosenbloom, Kate R. and Sabo, Peter and Safi, Alexias and Sanyal, Amartya and Shoresh, Noam and Simon, Jeremy M. and Song, Lingyun and Trinklein, Nathan D. and Altshuler, Robert C. and Birney, Ewan and Brown, James B. and Cheng, Chao and Djebali, Sarah and Dong, Xianjun and Ernst, Jason and Furey, Terrence S. and Gerstein, Mark and Giardine, Belinda and Greven, Melissa and Hardison, Ross C. and Harris, Robert S. and Herrero, Javier and Hoffman, Michael M. and Iyer, Sowmya and Kellis, Manolis and Kheradpour, Pouya and Lassmann, Timo and Li, Qunhua and Lin, Xinying and Marinov, Georgi K. and Merkel, Angelika and Mortazavi, Ali and Parker, Stephen C.J. and Reddy, Timothy E. and Rozowsky, Joel and Schlesinger, Felix and Thurman, Robert E. and Wang, Jie and Ward, Lucas D. and Whitfield, Troy W. and Wilder, Steven P. and Wu, Weisheng and Xi, Hualin S. and Yip, Kevin Y. and Zhuang, Jiali and Bernstein, Bradley E. and Green, Eric D. and Gunter, Chris and Snyder, Michael and Pazin, Michael J. and Lowdon, Rebecca F. and Dillon, Laura A.L. and Adams, Leslie B. and Kelly, Caroline J. and Zhang, Julia and Wexler, Judith R. and Good, Peter J. and Feingold, Elise A. and Crawford, Gregory E. and Dekker, Job and Elnitski, Laura and Farnham, Peggy J. and Giddings, Morgan C. and Gingeras, Thomas R. and Guig\'o, Roderic and Hubbard, Timothy J. and Kent, W. James and Lieb, Jason D. and Margulies, Elliott H. and Myers, Richard M. and Stamatoyannopoulos, John A. and Tenenbaum, Scott A. and Weng, Zhiping and White, Kevin P. and Wold, Barbara and Yu, Yanbao and Wrobel, John and Risk, Brian A. and Gunawardena, Harsha P. and Kuiper, Heather C. and Maier, Christopher W.
and Xie, Ling and Chen, Xian and Mikkelsen, Tarjei S. and Gillespie, Shawn and Goren, Alon and Ram, Oren and Zhang, Xiaolan and Wang, Li and Issner, Robbyn and Coyne, Michael J. and Durham, Timothy and Ku, Manching and Truong, Thanh and Eaton, Matthew L. and Dobin, Alex and Tanzer, Andrea and Lagarde, Julien and Lin, Wei and Xue, Chenghai and Williams, Brian A. and Zaleski, Chris and R\"oder, Maik and Kokocinski, Felix and Abdelhamid, Rehab F. and Alioto, Tyler and Antoshechkin, Igor and Baer, Michael T. and Batut, Philippe and Bell, Ian and Bell, Kimberly and Chakrabortty, Sudipto and Chrast, Jacqueline and Curado, Joao and Derrien, Thomas and Drenkow, Jorg and Dumais, Erica and Dumais, Jackie and Duttagupta, Radha and Fastuca, Megan and Fejes-Toth, Kata and Ferreira, Pedro and Foissac, Sylvain and Fullwood, Melissa J. and Gao, Hui and Gonzalez, David and Gordon, Assaf and Howald, C\'edric and Jha, Sonali and Johnson, Rory and Kapranov, Philipp and King, Brandon and Kingswood, Colin and Li, Guoliang and Luo, Oscar J. and Park, Eddie and Preall, Jonathan B. and Presaud, Kimberly and Ribeca, Paolo and Robyr, Daniel and Ruan, Xiaoan and Sammeth, Michael and Sandhu, Kuljeet Singh and Schaeffer, Lorain and See, Lei Hoon and Shahab, Atif and Skancke, Jorgen and Suzuki, Ana Maria and Takahashi, Hazuki and Tilgner, Hagen and Trout, Diane and Walters, Nathalie and Wang, Huaien and Hayashizaki, Yoshihide and Reymond, Alexandre and Antonarakis, Stylianos E. and Hannon, Gregory J. and Ruan, Yijun and Carninci, Piero and Sloan, Cricket A. and Learned, Katrina and Malladi, Venkat S. and Wong, Matthew C. and Barber, Galt P. and Cline, Melissa S. and Dreszer, Timothy R. and Heitner, Steven G. and Karolchik, Donna and Kirkup, Vanessa M. and Meyer, Laurence R. and Long, Jeffrey C. and Maddren, Morgan and Raney, Brian J. and Grasfeder, Linda L. and Giresi, Paul G. and Battenhouse, Anna and Sheffield, Nathan C. and Showers, Kimberly A. and London, Darin and Bhinge, Akshay A.
and Shestak, Christopher and Schaner, Matthew R. and Kim, Seul Ki and Zhang, Zhuzhu Z. and Mieczkowski, Piotr A. and Mieczkowska, Joanna O. and Liu, Zheng and McDaniell, Ryan M. and Ni, Yunyun and Rashid, Naim U. and Kim, Min Jae and Adar, Sheera and Zhang, Zhancheng and Wang, Tianyuan and Winter, Deborah and Keefe, Damian and Iyer, Vishwanath R. and Zheng, Meizhen and Wang, Ping and Gertz, Jason and Vielmetter, Jost and Partridge, E. Christopher and Varley, Katherine E. and Gasper, Clarke and Bansal, Anita and Pepke, Shirley and Jain, Preti and Amrhein, Henry and Bowling, Kevin M. and Anaya, Michael and Cross, Marie K. and Muratet, Michael A. and Newberry, Kimberly M. and McCue, Kenneth and Nesmith, Amy S. and Fisher-Aylor, Katherine I. and Pusey, Barbara and DeSalvo, Gilberto and Parker, Stephanie L. and Balasubramanian, Sreeram and Davis, Nicholas S. and Meadows, Sarah K. and Eggleston, Tracy and Newberry, J. Scott and Levy, Shawn E. and Absher, Devin M. and Wong, Wing H. and Blow, Matthew J. and Visel, Axel and Pennachio, Len A. and Petrykowska, Hanna M. and Abyzov, Alexej and Aken, Bronwen and Barrell, Daniel and Barson, Gemma and Berry, Andrew and Bignell, Alexandra and Boychenko, Veronika and Bussotti, Giovanni and Davidson, Claire and Despacio-Reyes, Gloria and Diekhans, Mark and Ezkurdia, Iakes and Frankish, Adam and Gilbert, James and Gonzalez, Jose Manuel and Griffiths, Ed and Harte, Rachel and Hendrix, David A. and Hunt, Toby and Jungreis, Irwin and Kay, Mike and Khurana, Ekta and Leng, Jing and Lin, Michael F. and Loveland, Jane and Lu, Zhi and Manthravadi, Deepa and Mariotti, Marco and Mudge, Jonathan and Mukherjee, Gaurab and Notredame, Cedric and Pei, Baikang and Rodriguez, Jose Manuel and Saunders, Gary and Sboner, Andrea and Searle, Stephen and Sisu, Cristina and Snow, Catherine and Steward, Charlie and Tapanari, Electra and Tress, Michael L. and Van Baren, Marijke J.
and Washietl, Stefan and Wilming, Laurens and Zadissa, Amonida and Zhang, Zhengdong and Brent, Michael and Haussler, David and Valencia, Alfonso and Addleman, Nick and Alexander, Roger P. and Auerbach, Raymond K. and Balasubramanian, Suganthi and Bettinger, Keith and Bhardwaj, Nitin and Boyle, Alan P. and Cao, Alina R. and Cayting, Philip and Charos, Alexandra and Cheng, Yong and Eastman, Catharine and Euskirchen, Ghia and Fleming, Joseph D. and Grubert, Fabian and Habegger, Lukas and Hariharan, Manoj and Harmanci, Arif and Iyengar, Sushma and Jin, Victor X. and Karczewski, Konrad J. and Kasowski, Maya and Lacroute, Phil and Lam, Hugo and Lamarre-Vincent, Nathan and Lian, Jin and Lindahl-Allen, Marianne and Min, Renqiang and Miotto, Benoit and Monahan, Hannah and Moqtaderi, Zarmik and Mu, Xinmeng J. and O'Geen, Henriette and Ouyang, Zhengqing and Patacsil, Dorrelyn and Raha, Debasish and Ramirez, Lucia and Reed, Brian and Shi, Minyi and Slifer, Teri and Witt, Heather and Wu, Linfeng and Xu, Xiaoqin and Yan, Koon Kiu and Yang, Xinqiong and Struhl, Kevin and Weissman, Sherman M. and Penalva, Luiz O. and Karmakar, Subhradip and Bhanvadia, Raj R. and Choudhury, Alina and Domanus, Marc and Ma, Lijia and Moran, Jennifer and Victorsen, Alec and Auer, Thomas and Centanin, Lazaro and Eichenlaub, Michael and Gruhl, Franziska and Heermann, Stephan and Hoeckendorf, Burkhard and Inoue, Daigo and Kellner, Tanja and Kirchmaier, Stephan and Mueller, Claudia and Reinhardt, Robert and Schertel, Lea and Schneider, Stephanie and Sinn, Rebecca and Wittbrodt, Beate and Wittbrodt, Jochen and Jain, Gaurav and Balasundaram, Gayathri and Bates, Daniel L. and Byron, Rachel and Canfield, Theresa K. and Diegel, Morgan J. and Dunn, Douglas and Ebersol, Abigail K. and Frum, Tristan and Garg, Kavita and Gist, Erica and Hansen, R. Scott and Boatman, Lisa and Haugen, Eric and Humbert, Richard and Johnson, Audra K. and Johnson, Ericka M. and Kutyavin, Tattyana V.
and Lee, Kristen and Lotakis, Dimitra and Maurano, Matthew T. and Neph, Shane J. and Neri, Fiedencio V. and Nguyen, Eric D. and Qu, Hongzhu and Reynolds, Alex P. and Roach, Vaughn and Rynes, Eric and Sanchez, Minerva E. and Sandstrom, Richard S. and Shafer, Anthony O. and Stergachis, Andrew B. and Thomas, Sean and Vernot, Benjamin and Vierstra, Jeff and Vong, Shinny and Wang, Hao and Weaver, Molly A. and Yan, Yongqi and Zhang, Miaohua and Akey, Joshua M. and Bender, Michael and Dorschner, Michael O. and Groudine, Mark and MacCoss, Michael J. and Navas, Patrick and Stamatoyannopoulos, George and Beal, Kathryn and Brazma, Alvis and Flicek, Paul and Johnson, Nathan and Lukk, Margus and Luscombe, Nicholas M. and Sobral, Daniel and Vaquerizas, Juan M. and Batzoglou, Serafim and Sidow, Arend and Hussami, Nadine and Kyriazopoulou-Panagiotopoulou, Sofia and Libbrecht, Max W. and Schaub, Marc A. and Miller, Webb and Bickel, Peter J. and Banfai, Balazs and Boley, Nathan P. and Huang, Haiyan and Li, Jingyi Jessica and Noble, William Stafford and Bilmes, Jeffrey A. and Buske, Orion J. and Sahu, Avinash D. and Kharchenko, Peter V. and Park, Peter J. and Baker, Dannon and Taylor, James and Lochovsky, Lucas},
  date = {2012},
  doi = {10/bg9d},
  eprint = {22955616},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Dunham et al. - 2012 - An integrated encyclopedia of DNA elements in the .pdf},
  issn = {1476-4687},
  journaltitle = {Nature},
  number = {7414},
  pages = {57-74},
  title = {An Integrated Encyclopedia of {{DNA}} Elements in the Human Genome},
  volume = {489}
}

@article{Dunkler2010a,
  abstract = {MOTIVATION: Univariate Cox regression (COX) is often used to select genes possibly linked to survival. With non-proportional hazards (NPH), COX could lead to under- or over-estimation of effects. The effect size measure c=P(T(1)},
  author = {Dunkler, Daniela and Schemper, Michael and Heinze, Georg},
  date = {2010-03-15},
  doi = {10/cds3c7},
  eprint = {20118118},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Dunkler et al. - 2010 - Gene selection in microarray survival studies unde.pdf},
  issn = {1367-4811},
  journaltitle = {Bioinformatics},
  keywords = {Gene Expression Profiling,Gene Expression Profiling: methods,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods,Regression Analysis,Transcription Factors,Transcription Factors: genetics},
  number = {6},
  pages = {784-790},
  title = {Gene Selection in Microarray Survival Studies under Possibly Non-Proportional Hazards},
  volume = {26}
}

@manual{Durinck2008,
  author = {Durinck, Steffen and Huber, Wolfgang},
  date = {2008},
  file = {/Users/ryan/Documents/Zotero Library/Durinck and Huber - 2008 - The biomaRt user ’ s guide.pdf},
  pagetotal = {22},
  title = {The {{biomaRt}} User's Guide}
}

@article{Dutta2012a,
  abstract = {BACKGROUND: Identification of canonical pathways through enrichment of differentially expressed genes in a given pathway is a widely used method for interpreting gene lists generated from high-throughput experimental studies. However, most algorithms treat pathways as sets of genes, disregarding any inter- and intra-pathway connectivity information, and do not provide insights beyond identifying lists of pathways.

RESULTS: We developed an algorithm (PathNet) that utilizes the connectivity information in canonical pathway descriptions to help identify study-relevant pathways and characterize non-obvious dependencies and connections among pathways using gene expression data. PathNet considers both the differential expression of genes and their pathway neighbors to strengthen the evidence that a pathway is implicated in the biological conditions characterizing the experiment. As an adjunct to this analysis, PathNet uses the connectivity of the differentially expressed genes among all pathways to score pathway contextual associations and statistically identify biological relations among pathways. In this study, we used PathNet to identify biologically relevant results in two Alzheimer's disease microarray datasets, and compared its performance with existing methods. Importantly, PathNet identified de-regulation of the ubiquitin-mediated proteolysis pathway as an important component in Alzheimer's disease progression, despite the absence of this pathway in the standard enrichment analyses.

CONCLUSIONS: PathNet is a novel method for identifying enrichment and association between canonical pathways in the context of gene expression data. It takes into account topological information present in pathways to reveal biological information. PathNet is available as an R workspace image from http://www.bhsai.org/downloads/pathnet/.},
  author = {Dutta, Bhaskar and Wallqvist, Anders and Reifman, Jaques},
  date = {2012-01},
  doi = {10/ggcxkc},
  eprint = {23006764},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Dutta et al. - 2012 - PathNet a tool for pathway analysis using topolog.pdf},
  issn = {1751-0473},
  journaltitle = {Source Code for Biology and Medicine},
  keywords = {canonical pathways,pathway association,pathway enrichment,pathway interaction,pathway topology},
  number = {1},
  pages = {10},
  title = {{{PathNet}}: A Tool for Pathway Analysis Using Topological Information},
  volume = {7}
}

@article{Ebeling2011b,
  abstract = {The long-tailed macaque, also referred to as cynomolgus monkey (Macaca fascicularis), is one of the most important nonhuman primate animal models in basic and applied biomedical research. To improve the predictive power of primate experiments for humans, we determined the genome sequence of a Macaca fascicularis female of Mauritian origin using a whole-genome shotgun sequencing approach. We applied a template switch strategy that uses either the rhesus or the human genome to assemble sequence reads. The sixfold sequence coverage of the draft genome sequence enabled discovery of about 2.1 million potential single-nucleotide polymorphisms based on occurrence of a dimorphic nucleotide at a given position in the genome sequence. Homology-based annotation allowed us to identify 17,387 orthologs of human protein-coding genes in the M. fascicularis draft genome, and the predicted transcripts enabled the design of a M. fascicularis-specific gene expression microarray. Using liver samples from 36 individuals of different geographic origin we identified 718 genes with highly variable expression in liver, whereas the majority of the transcriptome shows relatively stable and comparable expression. Knowledge of the M. fascicularis draft genome is an important contribution to both the use of this animal in disease models and the safety assessment of drugs and their metabolites. In particular, this information allows high-resolution genotyping and microarray-based gene-expression profiling for animal stratification, thereby allowing the use of well-characterized animals for safety testing. Finally, the genome sequence presented here is a significant contribution to the global "3R" animal welfare initiative, which has the goal to reduce, refine, and replace animal experiments.},
  author = {Ebeling, Martin and K\"ung, Erich and See, Angela and Broger, Clemens and Steiner, Guido and Berrera, Marco and Heckel, Tobias and Iniguez, Leonardo and Albert, Thomas and Schmucki, Roland and Biller, Hermann and Singer, Thomas and Certa, Ulrich},
  date = {2011-10},
  doi = {10/d5rpxw},
  eprint = {21862625},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Ebeling et al. - 2011 - Genome-based analysis of the nonhuman primate Maca.pdf},
  issn = {1549-5469},
  journaltitle = {Genome Research},
  keywords = {Animal,Animals,cyno-genome,cyno-project,Cytochrome P-450 Enzyme System,Cytochrome P-450 Enzyme System: genetics,Cytokines,Cytokines: genetics,DNA,DNA: genetics,DNA: isolation \& purification,Drug Evaluation,Female,Gene Expression Profiling,Gene Expression Profiling: methods,Genetic,Genome,High-Throughput Nucleotide Sequencing,Humans,Liver,Liver: metabolism,Macaca fascicularis,Macaca fascicularis: genetics,Models,Nucleic Acid,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods,Organic Anion Transporters,Organic Anion Transporters: genetics,Phylogeny,Polymorphism,Preclinical,Sequence Analysis,Sequence Homology,Single Nucleotide,Transcription},
  number = {10},
  pages = {1746-1756},
  title = {Genome-Based Analysis of the Nonhuman Primate {{Macaca fascicularis}} as a Model for Drug Safety Assessment},
  volume = {21}
}

@article{Ecker2012,
  abstract = {The Encyclopedia of DNA Elements (ENCODE) project dishes up a hearty banquet of data that illuminate the roles of the functional elements of the human genome. Here, six scientists describe the project and discuss how the data are influencing research directions across many fields. See Articles p.57, p.75, p.83, p.91, p.101 \& Letter p.109},
  author = {Ecker, Joseph R. and Bickmore, Wendy A. and Barroso, In\^es and Pritchard, Jonathan K. and Gilad, Yoav and Segal, Eran},
  date = {2012},
  doi = {10/ggcxkd},
  eprint = {22955614},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Ecker et al. - 2012 - Genomics ENCODE explained.pdf},
  issn = {0028-0836},
  journaltitle = {Nature},
  number = {7414},
  pages = {52-55},
  title = {Genomics: {{ENCODE}} Explained},
  volume = {489}
}

@article{Efron1983,
  abstract = {We construct a prediction rule on the basis of some data, and then wish to estimate the error rate of this rule in classifying future observations. Cross-validation provides a nearly unbiased estimate, using only the original data. Cross-validation turns out to be related ...},
  author = {Efron, Bradley},
  date = {1983},
  doi = {10/gdnbtp},
  eprint = {2288636},
  eprinttype = {jstor},
  file = {/Users/ryan/Documents/Zotero Library/Efron - 1983 - Estimating the Error Rate of a Prediction Rule Im.pdf},
  issn = {0162-1459},
  journaltitle = {Journal of the American Statistical Association},
  keywords = {anova,bootstrap,decomposition,logistic regression,prediction problem},
  number = {382},
  pages = {316-331},
  title = {Estimating the Error Rate of a Prediction Rule: {{Improvement}} on Cross-Validation},
  volume = {78}
}

@article{Efron1983a,
  abstract = {This is an invited expository article for The American Statistician. It reviews the nonparametric estimation of statistical error, mainly the bias and standard error of an estimator, or the error rate of a prediction rule. The presentation is written at a relaxed mathematical level, omitting most proofs, regularity conditions, and technical details.},
  author = {Efron, Bradley and Gong, Gail},
  date = {1983},
  doi = {10/gfxsm7},
  eprint = {2685844},
  eprinttype = {jstor},
  file = {/Users/ryan/Documents/Zotero Library/Efron and Gong - 1983 - A leisurely look at the bootstrap, the jackknife, .pdf},
  issn = {0003-1305},
  journaltitle = {The American Statistician},
  keywords = {bias estimation,error rate prediction,nonparametric confidence intervals,nonparametric standard errors,variance estimation},
  number = {1},
  pages = {36-48},
  title = {A Leisurely Look at the Bootstrap, the Jackknife, and Cross-Validation},
  volume = {37}
}

@article{Efron1997,
  abstract = {A study investigates the error rate of a rule for predicting future responses constructed from a training set of data. Results are nonparametric and apply to any possible prediction rule.},
  author = {Efron, Bradley and Tibshirani, Robert},
  date = {1997},
  doi = {10/gfts5c},
  file = {/Users/ryan/Documents/Zotero Library/Efron and Tibshirani - 1997 - Improvements on cross-validation The .632 plus bo.pdf},
  issn = {0162-1459},
  journaltitle = {Journal of the American Statistical Association},
  keywords = {classification,cross-validation bootstrap,prediction rule},
  number = {438},
  pages = {548-560},
  title = {Improvements on Cross-Validation: {{The}} .632+ Bootstrap Method},
  volume = {92}
}

% Efron and Tibshirani (2002): empirical Bayes and false discovery rates for microarray data.
@article{Efron2002,
  abstract = {In a classic two-sample problem, one might use Wilcoxon's statistic to test for a difference between treatment and control subjects. The analogous microarray experiment yields thousands of Wilcoxon statistics, one for each gene on the array, and confronts the statistician with a difficult simultaneous inference situation. We will discuss two inferential approaches to this problem: an empirical Bayes method that requires very little a priori Bayesian modeling, and the frequentist method of ``false discovery rates'' proposed by Benjamini and Hochberg in 1995. It turns out that the two methods are closely related and can be used together to produce sensible simultaneous inferences.},
  author = {Efron, Bradley and Tibshirani, Robert},
  date = {2002},
  doi = {10/bkspzk},
  file = {/Users/ryan/Documents/Zotero Library/Efron and Tibshirani - 2002 - Empirical bayes methods and false discovery rates .pdf;/Users/ryan/Zotero/storage/DF4Z9GQI/gepi.html},
  issn = {1098-2272},
  journaltitle = {Genetic Epidemiology},
  keywords = {a posteriori probability of gene significance,multiple comparisons,simultaneous hypothesis tests},
  langid = {english},
  number = {1},
  pages = {70-86},
  title = {Empirical {{Bayes}} Methods and False Discovery Rates for Microarrays},
  volume = {23}
}

% Einecke et al. (2010): microarray-derived molecular risk score for predicting kidney graft loss.
@article{Einecke2010,
  author       = {Einecke, Gunilla and Reeve, Jeff and Sis, Banu and Mengel, Michael and Hidalgo, Luis and Famulski, Konrad S and Matas, Arthur and Kasiske, Bert and Kaplan, Bruce and Halloran, Philip F},
  title        = {A Molecular Classifier for Predicting Future Graft Loss in Late Kidney Transplant Biopsies},
  journaltitle = {Journal of Clinical Investigation},
  date         = {2010-06-01},
  volume       = {120},
  number       = {6},
  pages        = {1862-1872},
  doi          = {10/dkhgsf},
  eprint       = {20501945},
  eprinttype   = {pmid},
  issn         = {0021-9738},
  keywords     = {Biopsy,Capillaries,Capillaries: pathology,Disease Progression,Forecasting,Glomerular Filtration Rate,Graft Rejection,Graft Rejection: diagnosis,Graft Rejection: pathology,Humans,Kidney,Kidney Transplantation,Kidney Transplantation: pathology,Kidney: pathology,Proteinuria,Proteinuria: pathology,Renal Insufficiency,Renal Insufficiency: pathology,Transforming Growth Factor beta},
  abstract     = {Kidney transplant recipients that develop signs of renal dysfunction or proteinuria one or more years after transplantation are at considerable risk for progression to renal failure. To assess the kidney at this time, a "for-cause" biopsy is performed, but this provides little indication as to which recipients will go on to organ failure. In an attempt to identify molecules that could provide this information, we used microarrays to analyze gene expression in 105 for-cause biopsies taken between 1 and 31 years after transplantation. Using supervised principal components analysis, we derived a molecular classifier to predict graft loss. The genes associated with graft failure were related to tissue injury, epithelial dedifferentiation, matrix remodeling, and TGF-beta effects and showed little overlap with rejection-associated genes. We assigned a prognostic molecular risk score to each patient, identifying those at high or low risk for graft loss. The molecular risk score was correlated with interstitial fibrosis, tubular atrophy, tubulitis, interstitial inflammation, proteinuria, and glomerular filtration rate. In multivariate analysis, molecular risk score, peritubular capillary basement membrane multilayering, arteriolar hyalinosis, and proteinuria were independent predictors of graft loss. In an independent validation set, the molecular risk score was the only predictor of graft loss. Thus, the molecular risk score reflects active injury and is superior to either scarring or function in predicting graft failure.},
  file         = {/Users/ryan/Documents/Zotero Library/Einecke et al. - 2010 - A molecular classifier for predicting future graft.pdf}
}

% Eksi et al. (2013): only the acronym in the title is brace-protected so styles can apply their own casing.
@article{Eksi2013,
  author = {Eksi, Ridvan and Li, Hong-Dong and Menon, Rajasree and Wen, Yuchen and Omenn, Gilbert S. and Kretzler, Matthias and Guan, Yuanfang},
  date = {2013-11-07},
  doi = {10/f5kw8p},
  editor = {Iakoucheva, Lilia M.},
  file = {/Users/ryan/Documents/Zotero Library/Eksi et al. - 2013 - Systematically Differentiating Functions for Alter.pdf},
  issn = {1553-7358},
  journaltitle = {PLoS Computational Biology},
  number = {11},
  pages = {e1003314},
  title = {Systematically Differentiating Functions for Alternatively Spliced Isoforms through Integrating {{RNA}}-Seq Data},
  volume = {9}
}

% Elzawahry et al. (2014): differential protein-interaction networks during the innate immune response.
@article{Elzawahry2014,
  abstract = {Innate immune response involves protein-protein interactions, deoxyribonucleic acid (DNA)-protein interactions and signaling cascades. So far, thousands of protein-protein interactions have been curated as a static interaction map. However, protein-protein interactions involved in innate immune response are dynamic. We recorded the dynamics in the interactome during innate immune response by combining gene expression data of lipopolysaccharide (LPS)-stimulated dendritic cells with protein-protein interactions data. We identified the differences in interactome during innate immune response by constructing differential networks and identifying protein modules, which were up-/down-regulated at each stage during the innate immune response. For each protein complex, we identified enriched biological processes and pathways. In addition, we identified core interactions that are conserved throughout the innate immune response and their enriched gene ontology terms and pathways. We defined two novel measures to assess the differences between network maps at different time points. We found that the protein interaction network at 1 hour after LPS stimulation has the highest interactions protein ratio, which indicates a role for proteins with large number of interactions in innate immune response. A pairwise differential matrix allows for the global visualization of the differences between different networks. We investigated the toll-like receptor subnetwork and found that S100A8 is down-regulated in dendritic cells after LPS stimulation. Identified protein complexes have a crucial role not only in innate immunity, but also in circadian rhythms, pathways involved in cancer, and p53 pathways. The study confirmed previous work that reported a strong correlation between cancer and immunity.},
  author = {Elzawahry, Asmaa and Patil, Ashwini and Kumagai, Yutaro and Suzuki, Yutaka and Nakai, Kenta},
  date = {2014-01-06},
  doi = {10/ggcxkf},
  eprint = {24453478},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Elzawahry et al. - 2014 - Innate immunity interactome dynamics..pdf},
  issn = {1177-6250},
  journaltitle = {Gene Regulation and Systems Biology},
  keywords = {differential networks,gene expression,innate immunity,interactome dynamics,protein-protein interactions},
  pages = {1-15},
  title = {Innate Immunity Interactome Dynamics},
  volume = {8}
}

% Emig et al. (2010): AltAnalyze and DomainGraph workflow for exon array splicing analysis.
@article{Emig2010a,
  author       = {Emig, Dorothea and Salomonis, Nathan and Baumbach, Jan and Lengauer, Thomas and Conklin, Bruce R and Albrecht, Mario},
  title        = {{{AltAnalyze}} and {{DomainGraph}}: Analyzing and Visualizing Exon Expression Data},
  journaltitle = {Nucleic Acids Research},
  date         = {2010-07-01},
  volume       = {38},
  issue        = {suppl\_2},
  pages        = {W755-W762},
  doi          = {10/b5d3d9},
  eprint       = {20513647},
  eprinttype   = {pmid},
  issn         = {1362-4962},
  keywords     = {Alternative Splicing,Animals,Computer Graphics,Exons,Gene Expression Profiling,Humans,Internet,Mice,Rats,Software},
  abstract     = {Alternative splicing is an important mechanism for increasing protein diversity. However, its functional effects are largely unknown. Here, we present our new software workflow composed of the open-source application AltAnalyze and the Cytoscape plugin DomainGraph. Both programs provide an intuitive and comprehensive end-to-end solution for the analysis and visualization of alternative splicing data from Affymetrix Exon and Gene Arrays at the level of proteins, domains, microRNA binding sites, molecular interactions and pathways. Our software tools include easy-to-use graphical user interfaces, rigorous statistical methods (FIRMA, MiDAS and DABG filtering) and do not require prior knowledge of exon array analysis or programming. They provide new methods for automatic interpretation and visualization of the effects of alternative exon inclusion on protein domain composition and microRNA binding sites. These data can be visualized together with affected pathways and gene or protein interaction networks, allowing a straightforward identification of potential biological effects due to alternative splicing at different levels of granularity. Our programs are available at http://www.altanalyze.org and http://www.domaingraph.de. These websites also include extensive documentation, tutorials and sample data.},
  file         = {/Users/ryan/Documents/Zotero Library/Emig et al. - 2010 - AltAnalyze and DomainGraph analyzing and visualiz.pdf;/Users/ryan/Documents/Zotero Library/Emig et al. - 2010 - AltAnalyze and DomainGraph analyzing and visualiz2.pdf}
}

% epic: SICER-based ChIP-seq domain caller (GitHub repository).
% The author is a person, so the name is given in family, given form rather than as a braced corporate name.
@software{endrebakkenstovnerEpicDiffuseDomain2019,
  abstract = {epic is a software package for finding medium to diffusely enriched domains in chip-seq data. It is a fast, parallel and memory-efficient implementation of the incredibly popular SICER algorithm. By running epic on a set of data ("ChIP") files and control ("Input") files, epic is able to quickly differentially enriched regions. epic is an improvement over the original SICER by being faster, more memory efficient, multicore, and significantly much easier to install and use.},
  author = {Stovner, Endre Bakken},
  date = {2019-07-20T13:59:09Z},
  ids = {gh-epic},
  keywords = {bioinformatics,chip-seq,chip-seq-callers,chipseq,peak-caller,sicer,sicer-algorithm},
  organization = {{BioCore, NTNU}},
  origdate = {2016-04-01T01:59:44Z},
  title = {Epic: Diffuse Domain {{ChIP}}-{{Seq}} Caller Based on {{SICER}}},
  url = {https://github.com/biocore-ntnu/epic},
  urldate = {2019-11-14}
}

% Engstrom et al. (2013): benchmark of spliced RNA-seq aligners.
% NOTE(review): volume, number and pages are for the print issue (December 2013); date is the online
% publication date. Confirm against the publisher record.
@article{Engstrom2013,
  abstract = {High-throughput RNA sequencing is an increasingly accessible method for studying gene structure and activity on a genome-wide scale. A critical step in RNA-seq data analysis is the alignment of partial transcript reads to a reference genome sequence. To assess the performance of current mapping software, we invited developers of RNA-seq aligners to process four large human and mouse RNA-seq data sets. In total, we compared 26 mapping protocols based on 11 programs and pipelines and found major performance differences between methods on numerous benchmarks, including alignment yield, basewise accuracy, mismatch and gap placement, exon junction discovery and suitability of alignments for transcript reconstruction. We observed concordant results on real and simulated RNA-seq data, confirming the relevance of the metrics employed. Future developments in RNA-seq alignment methods would benefit from improved placement of multimapped reads, balanced utilization of existing gene annotation and a reduced false discovery rate for splice junctions.},
  author = {Engstr\"om, P\"ar G and Steijger, Tamara and Sipos, Botond and Grant, Gregory R and Kahles, Andr\'e and Alioto, Tyler and Behr, Jonas and Bertone, Paul and Bohnert, Regina and Campagna, Davide and Davis, Carrie A and Dobin, Alexander and Gingeras, Thomas R and Goldman, Nick and Guig\'o, Roderic and Harrow, Jennifer and Hubbard, Tim J and Jean, G\'eraldine and Kosarev, Peter and Li, Sheng and Liu, Jinze and Mason, Christopher E and Molodtsov, Vladimir and Ning, Zemin and Ponstingl, Hannes and Prins, Jan F and R\"atsch, Gunnar and Ribeca, Paolo and Seledtsov, Igor and Solovyev, Victor and Valle, Giorgio and Vitulo, Nicola and Wang, Kai and Wu, Thomas D and Zeller, Georg},
  date = {2013-11-03},
  doi = {10/pt9},
  eprint = {24185836},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Engström et al. - 2013 - Systematic evaluation of spliced alignment program.pdf},
  issn = {1548-7105},
  journaltitle = {Nature Methods},
  number = {12},
  pages = {1185-1191},
  title = {Systematic Evaluation of Spliced Alignment Programs for {{RNA}}-Seq Data},
  volume = {10}
}

% Ernst and Vakoc (2012): review of the WRAD module of SET1-family H3K4 methyltransferases.
@article{Ernst2012,
  abstract = {Methylation of histone H3 at lysine 4 (H3K4) is a conserved feature of active chromatin catalyzed by methyltransferases of the SET1-family (SET1A, SET1B, MLL1, MLL2, MLL3 and MLL4 in humans). These enzymes participate in diverse gene regulatory networks with a multitude of known biological functions, including direct involvement in several human disease states. Unlike most lysine methyltransferases, SET1-family enzymes are only fully active in the context of a multi-subunit complex, which includes a protein module comprised of WDR5, RbBP5, ASH2L and DPY-30 (WRAD). These proteins bind in close proximity to the catalytic SET domain of SET1-family enzymes and stimulate H3K4 methyltransferase activity. The mechanism by which WRAD promotes catalysis involves elements of allosteric control and possibly the utilization of a second H3K4 methyltransferase active site present within WRAD itself. WRAD components also engage in physical interactions that recruit SET1-family proteins to target sites on chromatin. Here, the known molecular mechanisms through which WRAD enables the function of SET1-related enzymes will be reviewed.},
  author = {Ernst, Patricia and Vakoc, Christopher R.},
  date = {2012-05-01},
  doi = {10/f33wqh},
  eprint = {22652693},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Ernst and Vakoc - 2012 - WRAD enabler of the SET1-family of H3K4 methyltra.pdf},
  issn = {2041-2649},
  journaltitle = {Briefings in Functional Genomics},
  keywords = {ASH2L,DPY-30,MLL,RbBP5,SET1,WDR5},
  number = {3},
  pages = {217-226},
  title = {{{WRAD}}: Enabler of the {{SET1}}-Family of {{H3K4}} Methyltransferases},
  volume = {11}
}

% Esnaola et al. (2013): count data model for extensively replicated RNA-seq experiments.
@article{Esnaola2013,
  author       = {Esnaola, Mikel and Puig, Pedro and Gonzalez, David and Castelo, Robert and Gonzalez, Juan R},
  title        = {A Flexible Count Data Model to Fit the Wide Diversity of Expression Profiles Arising from Extensively Replicated {{RNA}}-Seq Experiments},
  journaltitle = {BMC Bioinformatics},
  date         = {2013},
  volume       = {14},
  number       = {1},
  pages        = {254},
  doi          = {10/gb8vvs},
  issn         = {1471-2105},
  file         = {/Users/ryan/Documents/Zotero Library/Esnaola et al. - 2013 - A flexible count data model to fit the wide divers.pdf}
}

% Etz (2015): practical guide to Bayes factors for linear regression in R (The Winnower).
@article{Etz2015,
  author = {Etz, Alexander},
  date = {2015-06-10},
  file = {/Users/ryan/Documents/Zotero Library/Etz - 2015 - Using Bayes Factors to Get the Most out of Linear .pdf},
  journaltitle = {The Winnower},
  keywords = {bayes factor,bayesian,linear regression,model selection},
  title = {Using {{Bayes}} Factors to Get the Most out of Linear Regression: {{A}} Practical Guide Using {{R}}}
}

% Etz and Vandekerckhove (2016): Bayes factor reanalysis of the Reproducibility Project: Psychology.
% NOTE(review): pages holds the PLoS ONE article e-locator, as in the other PLoS entries; confirm against the publisher record.
@article{Etz2016,
  abstract = {We revisit the results of the recent Reproducibility Project: Psychology by the Open Science Collaboration. We compute Bayes factors\textemdash{}a quantity that can be used to express comparative evidence for an hypothesis but also for the null hypothesis\textemdash{}for a large subset (N = 72) of the original papers and their corresponding replication attempts. In our computation, we take into account the likely scenario that publication bias had distorted the originally published results. Overall, 75\% of studies gave qualitatively similar results in terms of the amount of evidence provided. However, the evidence was often weak (i.e., Bayes factor {$<$} 10). The majority of the studies (64\%) did not provide strong evidence for either the null or the alternative hypothesis in either the original or the replication, and no replication attempts provided strong evidence in favor of the null. In all cases where the original paper provided strong evidence but the replication did not (15\%), the sample size in the replication was smaller than the original. Where the replication provided strong evidence but the original did not (10\%), the replication sample size was larger. We conclude that the apparent failure of the Reproducibility Project to replicate many target effects can be adequately explained by overestimation of effect sizes (or overestimation of evidence against the null hypothesis) due to small sample sizes and publication bias in the psychological literature. We further conclude that traditional sample sizes are insufficient and that a more widespread adoption of Bayesian methods is desirable.},
  author = {Etz, Alexander and Vandekerckhove, Joachim},
  date = {2016},
  doi = {10/f8tq2f},
  eprint = {26919473},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Etz and Vandekerckhove - 2016 - A Bayesian perspective on the reproducibility proj.pdf},
  issn = {1932-6203},
  journaltitle = {PLoS ONE},
  number = {2},
  pages = {e0149794},
  title = {A {{Bayesian}} Perspective on the Reproducibility Project: {{Psychology}}},
  volume = {11}
}

% Fan et al. (2005): generalized probe models (GPMs) for Affymetrix GeneChip expression data.
@article{Fan2005,
  abstract = {BACKGROUND: Various analytical methods exist that first quantify gene expression and then analyze differentially expressed genes from Affymetrix GeneChip gene expression analysis array data. These methods differ in the choice of probe measure (quantification of probe hybridization), summarizing multiple probe intensities into a gene expression value, and analysis of differential gene expression. Research papers that describe these methods focus on performance, and how their approaches differ from others. To better understand the common features and differences between various methods, and to evaluate their impact on the results of gene expression analysis, we describe a class of models, referred to as generalized probe models (GPMs), which encompass various currently available methods.

RESULTS: Using an empirical dataset, we compared different formulations of GPMs, and GPMs with three other commonly used methods, i.e. MAS 5.0, dChip, and RMA. The comparison shows that, on a genome-wide scale, different methods yield similar results if the same probe measures are chosen.

CONCLUSION: In this paper we present a general framework, i.e. GPMs, which encompasses various methods. GPMs permit the use of a wide range of probe measures and facilitate appropriate comparison between commonly used methods. We demonstrate that the dissimilar results stem primarily from different choice of probe measures, rather than other factors.},
  author = {Fan, Wenhong and Pritchard, Joel I and Olson, James M and Khalid, Najma and Zhao, Lue Ping},
  date = {2005-01},
  doi = {10/dq4dw6},
  eprint = {15710039},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Fan et al. - 2005 - A class of models for analyzing GeneChip gene expr.pdf},
  issn = {1471-2164},
  journaltitle = {BMC Genomics},
  keywords = {Algorithms,Cell Line; Tumor,Data Interpretation; Statistical,DNA Primers,DNA Primers: chemistry,Gene Expression Profiling,Genome,Humans,Models; Statistical,Nucleic Acid Hybridization,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods,Software},
  pages = {16},
  title = {A Class of Models for Analyzing {{GeneChip}} Gene Expression Analysis Array Data},
  volume = {6}
}

% Fan et al. (2011): comparison of MIDAS and Partek GS for exon array splice-variant detection.
% No DOI is recorded; the PubMed Central URL is the only locator.
@article{Fan2011,
  abstract = {The Affymetrix GeneChip Exon Array can be used to detect alternative splice variants. Microarray Detection of Alternative Splicing (MIDAS) and Partek(\textregistered{}) Genomics Suite (Partek(\textregistered{}) GS) are among the most popular analytical methods used to analyze exon array data. While both methods utilize statistical significance for testing, MIDAS and Partek(\textregistered{}) GS could produce somewhat different results due to different underlying assumptions. Comparing MIDAS and Partek(\textregistered{}) GS is quite difficult due to their substantially different mathematical formulations and assumptions regarding alternative splice variants. For meaningful comparison, we have used the previously published generalized probe model (GPM) which encompasses both MIDAS and Partek(\textregistered{}) GS under different assumptions. We analyzed a colon cancer exon array data set using MIDAS, Partek(\textregistered{}) GS and GPM. MIDAS and Partek(\textregistered{}) GS produced quite different sets of genes that are considered to have alternative splice variants. Further, we found that GPM produced results similar to MIDAS as well as to Partek(\textregistered{}) GS under their respective assumptions. Within the GPM, we show how discoveries relating to alternative variants can be quite different due to different assumptions. MIDAS focuses on relative changes in expression values across different exons within genes and tends to be robust but less efficient. Partek(\textregistered{}) GS, however, uses absolute expression values of individual exons within genes and tends to be more efficient but more sensitive to the presence of outliers. From our observations, we conclude that MIDAS and Partek(\textregistered{}) GS produce complementary results, and discoveries from both analyses should be considered.},
  author = {Fan, Wenhong and Stirewalt, Derek L and Radich, Jerald P and Zhao, Lueping},
  date = {2011-09},
  eprint = {23675234},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Fan et al. - 2011 - Comparison of Two Methods for Detecting Alternativ.pdf},
  issn = {1550-9702},
  journaltitle = {International Journal of Biomedical Science},
  keywords = {alternative splicing,exon,gene expression analysis},
  number = {3},
  pages = {172-180},
  title = {Comparison of Two Methods for Detecting Alternative Splice Variants Using {{GeneChip}}\textregistered{} Exon Arrays},
  url = {http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=3614835&tool=pmcentrez&rendertype=abstract},
  volume = {7}
}

% Faraway (2002): book-length manuscript distributed as a PDF, not a journal article.
% NOTE(review): the URL points at the CRAN contributed-documentation copy; confirm it matches the attached PDF.
@book{Faraway2002,
  author = {Faraway, Julian J.},
  date = {2002-07},
  file = {/Users/ryan/Documents/Zotero Library/Faraway - 2002 - Practical Regression and ANOVA using R.pdf},
  title = {Practical Regression and {{ANOVA}} Using {{R}}},
  url = {https://cran.r-project.org/doc/contrib/Faraway-PRA.pdf}
}

% Feng, Liu and Zhang (2011): Current Protocols unit on peak calling with MACS.
% NOTE(review): volume and pages presumably hold the PubMed-style chapter and unit locator, not a numeric volume and page range.
@article{Feng2011,
  abstract = {Model-based Analysis of ChIP-Seq (MACS) is a command-line tool designed by X. Shirley Liu and colleagues to analyze data generated by ChIP-Seq experiments in eukaryotes, especially mammals. MACS can be used to identify transcription factor binding sites and histone modification-enriched regions if the ChIP-Seq data, with or without control samples, are given. This unit describes two basic protocols that provide detailed information on how to use MACS to identify either the binding sites of a transcription factor or the enriched regions of a histone modification with broad peaks. Furthermore, the basic ideas for the MACS algorithm and its appropriate usage are discussed.},
  author = {Feng, Jianxing and Liu, Tao and Zhang, Yong},
  date = {2011-06},
  doi = {10/ccfz9s},
  eprint = {21633945},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Feng et al. - 2011 - Using MACS to identify peaks from ChIP-Seq data..pdf},
  isbn = {0471250953},
  issn = {1934-340X},
  journaltitle = {Current Protocols in Bioinformatics},
  keywords = {Algorithms,Base Sequence,Binding Sites,Chromatin Immunoprecipitation,Chromatin Immunoprecipitation: methods,Computational Biology,Computational Biology: methods,Databases; Factual,Sequence Analysis; DNA,Transcription Factors,Transcription Factors: metabolism,tutorial},
  pages = {Unit 2.14},
  title = {Using {{MACS}} to Identify Peaks from {{ChIP}}-{{Seq}} Data},
  volume = {Chapter 2}
}

% Fernandes et al. (2013): ALDEx, an ANOVA-like differential expression procedure for RNA-Seq.
@article{Fernandes2013,
  abstract = {Experimental variance is a major challenge when dealing with high-throughput sequencing data. This variance has several sources: sampling replication, technical replication, variability within biological conditions, and variability between biological conditions. The high per-sample cost of RNA-Seq often precludes the large number of experiments needed to partition observed variance into these categories as per standard ANOVA models. We show that the partitioning of within-condition to between-condition variation cannot reasonably be ignored, whether in single-organism RNA-Seq or in Meta-RNA-Seq experiments, and further find that commonly-used RNA-Seq analysis tools, as described in the literature, do not enforce the constraint that the sum of relative expression levels must be one, and thus report expression levels that are systematically distorted. These two factors lead to misleading inferences if not properly accommodated. As it is usually only the biological between-condition and within-condition differences that are of interest, we developed ALDEx, an ANOVA-like differential expression procedure, to identify genes with greater between- to within-condition differences. We show that the presence of differential expression and the magnitude of these comparative differences can be reasonably estimated with even very small sample sizes.},
  author = {Fernandes, Andrew D and Macklaim, Jean M and Linn, Thomas G and Reid, Gregor and Gloor, Gregory B},
  date = {2013-01},
  doi = {10/f48p9j},
  eprint = {23843979},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Fernandes et al. - 2013 - ANOVA-like differential expression (ALDEx) analysi.pdf},
  issn = {1932-6203},
  journaltitle = {PLoS ONE},
  number = {7},
  pages = {e67019},
  title = {{{ANOVA}}-like Differential Expression ({{ALDEx}}) Analysis for Mixed Population {{RNA}}-{{Seq}}},
  volume = {8}
}

% Ferreira et al. (2013): RNA-seq transcriptome characterization of chronic lymphocytic leukemia.
% NOTE(review): date is the online publication date; volume, number and pages are for the print issue
% and should be confirmed against the publisher record.
@article{Ferreira2013,
  author = {Ferreira, P. G. and Jares, P. and Rico, D. and Gomez-Lopez, G. and Martinez-Trillos, A. and Villamor, N. and Ecker, S. and Gonzalez-Perez, A. and Knowles, D. G. and Monlong, J. and Johnson, R. and Quesada, V. and Gouin, A. and Djebali, S. and Lopez-Guerra, M. and Colomer, D. and Royo, C. and Cazorla, M. and Pinyol, M. and Clot, G. and Aymerich, M. and Rozman, M. and Kulis, M. and Tamborero, D. and Papasaikas, P. and Blanc, J. and Gut, M. and Gut, I. and Puente, X. S. and Pisano, D. G. and Martin-Subero, J. I. and Lopez-Bigas, N. and Lopez-Guillermo, A. and Valencia, A. and Lopez-Otin, C. and Campo, E. and Guigo, R.},
  date = {2013-11-21},
  doi = {10/f5rft8},
  file = {/Users/ryan/Documents/Zotero Library/Ferreira et al. - 2013 - Transcriptome characterization by RNA sequencing i.pdf},
  issn = {1088-9051},
  journaltitle = {Genome Research},
  number = {2},
  pages = {212-226},
  title = {Transcriptome Characterization by {{RNA}} Sequencing Identifies a Major Molecular and Clinical Subdivision in Chronic Lymphocytic Leukemia},
  volume = {24}
}

% Figueroa et al. (2012): learning-curve fitting to predict the annotated sample size needed by a classifier.
@article{Figueroa2012,
  abstract = {BACKGROUND: Supervised learning methods need annotated data in order to generate efficient models. Annotated data, however, is a relatively scarce resource and can be expensive to obtain. For both passive and active learning methods, there is a need to estimate the size of the annotated sample required to reach a performance target.

METHODS: We designed and implemented a method that fits an inverse power law model to points of a given learning curve created using a small annotated training set. Fitting is carried out using nonlinear weighted least squares optimization. The fitted model is then used to predict the classifier's performance and confidence interval for larger sample sizes. For evaluation, the nonlinear weighted curve fitting method was applied to a set of learning curves generated using clinical text and waveform classification tasks with active and passive sampling methods, and predictions were validated using standard goodness of fit measures. As control we used an un-weighted fitting method.

RESULTS: A total of 568 models were fitted and the model predictions were compared with the observed performances. Depending on the data set and sampling method, it took between 80 to 560 annotated samples to achieve mean average and root mean squared error below 0.01. Results also show that our weighted fitting method outperformed the baseline un-weighted method (p {$<$} 0.05).

CONCLUSIONS: This paper describes a simple and effective sample size prediction algorithm that conducts weighted fitting of learning curves. The algorithm outperformed an un-weighted algorithm described in previous literature. It can help researchers determine annotation sample size for supervised machine learning.},
  author = {Figueroa, Rosa L and Zeng-Treitler, Qing and Kandula, Sasikiran and Ngo, Long H},
  date = {2012-01},
  doi = {10/gb345p},
  eprint = {22336388},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Figueroa et al. - 2012 - Predicting sample size required for classification.pdf},
  issn = {1472-6947},
  journaltitle = {BMC Medical Informatics and Decision Making},
  keywords = {Algorithms,Data Interpretation; Statistical,Diagnosis; Computer-Assisted,Humans,Learning Curve,Models; Statistical,Nonlinear Dynamics,Pattern Recognition; Automated,Predictive Value of Tests,Probability Learning,Problem-Based Learning,Problem-Based Learning: methods,Reproducibility of Results,Sample Size,Stochastic Processes},
  number = {1},
  pages = {8},
  title = {Predicting Sample Size Required for Classification Performance},
  volume = {12}
}

% Frazee et al. (2014): base-resolution differential expression analysis (derfinder).
% NOTE(review): date is the online publication date; volume, number and pages are for the print issue
% and should be confirmed against the publisher record.
@article{Frazee2014,
  abstract = {RNA-sequencing (RNA-seq) is a flexible technology for measuring genome-wide expression that is rapidly replacing microarrays as costs become comparable. Current differential expression analysis methods for RNA-seq data fall into two broad classes: (1) methods that quantify expression within the boundaries of genes previously published in databases and (2) methods that attempt to reconstruct full length RNA transcripts. The first class cannot discover differential expression outside of previously known genes. While the second approach does possess discovery capabilities, statistical analysis of differential expression is complicated by the ambiguity and variability incurred while assembling transcripts and estimating their abundances. Here, we propose a novel method that first identifies differentially expressed regions (DERs) of interest by assessing differential expression at each base of the genome. The method then segments the genome into regions comprised of bases showing similar differential expression signal, and then assigns a measure of statistical significance to each region. Optionally, DERs can be annotated using a reference database of genomic features. We compare our approach with leading competitors from both current classes of differential expression methods and highlight the strengths and weaknesses of each. A software implementation of our method is available on github (https://github.com/alyssafrazee/derfinder).},
  author = {Frazee, Alyssa C and Sabunciyan, Sarven and Hansen, Kasper D and Irizarry, Rafael A and Leek, Jeffrey T},
  date = {2014-01-06},
  doi = {10/gb87sq},
  eprint = {24398039},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Frazee et al. - 2014 - Differential expression analysis of RNA-seq data a.pdf},
  issn = {1468-4357},
  journaltitle = {Biostatistics},
  keywords = {bioinformatics,differential expression,false discovery rate,genomics,rna sequencing},
  number = {3},
  pages = {413-426},
  title = {Differential Expression Analysis of {{RNA}}-Seq Data at Single-Base Resolution},
  volume = {15}
}

@report{froussiosHowWellRNASeq2016,
  abstract = {RNA-seq experiments are usually carried out in three or fewer replicates. In order to work well with so few samples, Differential Gene Expression (DGE) tools typically assume the form of the underlying distribution of gene expression. A recent highly replicated study revealed that RNA-seq gene expression measurements in yeast are best represented as being drawn from an underlying negative binomial distribution. In this paper, the statistical properties of gene expression in the higher eukaryote Arabidopsis thaliana are shown to be essentially identical to those from yeast despite the large increase in the size and complexity of the transcriptome: Gene expression measurements from this model plant species are consistent with being drawn from an underlying negative binomial or log-normal distribution and the false positive rate performance of nine widely used DGE tools is not strongly affected by the additional size and complexity of the A. thaliana transcriptome. For RNA-seq data, we therefore recommend the use of DGE tools that are based on the negative binomial distribution.},
  author = {Froussios, Kimon and Schurch, Nick J. and Mackinnon, Katarzyna and Gierli{\'n}ski, Marek and Duc, C{\'e}line and Simpson, Gordon G. and Barton, Geoffrey J.},
  date = {2016-12-02},
  doi = {10/dd8t},
  file = {/Users/ryan/Documents/Zotero Library/Froussios et al. - 2016 - How well do RNA-Seq differential gene expression t.pdf},
  ids = {Froussios2016},
  institution = {{Genomics}},
  langid = {english},
  title = {How Well Do {{RNA}}-{{Seq}} Differential Gene Expression Tools Perform in a Eukaryote with a Complex Transcriptome?},
  type = {preprint},
  url = {http://biorxiv.org/lookup/doi/10.1101/090753},
  urldate = {2019-11-14}
}

@article{Fu2011,
  abstract = {We implement a unique strategy for single molecule counting termed stochastic labeling, where random attachment of a diverse set of labels converts a population of identical DNA molecules into a population of distinct DNA molecules suitable for threshold detection. The conceptual framework for stochastic labeling is developed and experimentally demonstrated by determining the absolute and relative number of selected genes after stochastically labeling approximately 360,000 different fragments of the human genome. The approach does not require the physical separation of molecules and takes advantage of highly parallel methods such as microarray and sequencing technologies to simultaneously count absolute numbers of multiple targets. Stochastic labeling should be particularly useful for determining the absolute numbers of RNA or DNA molecules in single cells.},
  author = {Fu, Glenn K and Hu, Jing and Wang, Pei-hua and Fodor, Stephen P A},
  date = {2011-05-31},
  doi = {10/ckt5zz},
  eprint = {21562209},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Fu et al. - 2011 - Counting individual DNA molecules by the stochasti.pdf},
  issn = {1091-6490},
  journaltitle = {Proceedings of the National Academy of Sciences of the United States of America},
  keywords = {B-Lymphocytes,B-Lymphocytes: cytology,DNA,DNA: analysis,Down Syndrome,Down Syndrome: genetics,Fluorescence,Fluorescence: methods,Genome,Human,Humans,Male,Microscopy,Models,Oligonucleotide Array Sequence Analysis,Poisson Distribution,Regression Analysis,Reproducibility of Results,RNA,RNA: analysis,Sequence Analysis,Statistical,Stochastic Processes},
  number = {22},
  pages = {9026-9031},
  title = {Counting Individual {{DNA}} Molecules by the Stochastic Attachment of Diverse Labels},
  volume = {108}
}

@article{Furey2012,
  abstract = {Chromatin immunoprecipitation experiments followed by sequencing (ChIP-seq) detect protein-DNA binding events and chemical modifications of histone proteins. Challenges in the standard ChIP-seq protocol have motivated recent enhancements in this approach, such as reducing the number of cells that are required and increasing the resolution. Complementary experimental approaches---for example, DNaseI hypersensitive site mapping and analysis of chromatin interactions that are mediated by particular proteins---provide additional information about DNA-binding proteins and their function. These data are now being used to identify variability in the functions of DNA-binding proteins across genomes and individuals. In this Review, I describe the latest advances in methods to detect and functionally characterize DNA-bound proteins. \textcopyright{} 2012 Macmillan Publishers Limited. All rights reserved.},
  author = {Furey, Terrence S.},
  date = {2012-12-23},
  doi = {10/f4gthj},
  file = {/Users/ryan/Documents/Zotero Library/Furey - 2012 - ChIP-seq and beyond New and improved methodologie.pdf},
  issn = {1471-0056},
  journaltitle = {Nature Reviews Genetics},
  number = {12},
  pages = {840-852},
  title = {{{ChIP}}-Seq and beyond: {{New}} and Improved Methodologies to Detect and Characterize Protein-{{DNA}} Interactions},
  volume = {13}
}

@article{GallegoRomero2014,
  abstract = {BACKGROUND: The use of low quality RNA samples in whole-genome gene expression profiling remains controversial. It is unclear if transcript degradation in low quality RNA samples occurs uniformly, in which case the effects of degradation can be corrected via data normalization, or whether different transcripts are degraded at different rates, potentially biasing measurements of expression levels. This concern has rendered the use of low quality RNA samples in whole-genome expression profiling problematic. Yet, low quality samples (for example, samples collected in the course of fieldwork) are at times the sole means of addressing specific questions.

RESULTS: We sought to quantify the impact of variation in RNA quality on estimates of gene expression levels based on RNA-seq data. To do so, we collected expression data from tissue samples that were allowed to decay for varying amounts of time prior to RNA extraction. The RNA samples we collected spanned the entire range of RNA Integrity Number (RIN) values (a metric commonly used to assess RNA quality). We observed widespread effects of RNA quality on measurements of gene expression levels, as well as a slight but significant loss of library complexity in more degraded samples.

CONCLUSIONS: While standard normalizations failed to account for the effects of degradation, we found that by explicitly controlling for the effects of RIN using a linear model framework we can correct for the majority of these effects. We conclude that in instances in which RIN and the effect of interest are not associated, this approach can help recover biologically meaningful signals in data from degraded RNA samples.},
  author = {Gallego Romero, Irene and Pai, Athma A and Tung, Jenny and Gilad, Yoav},
  date = {2014},
  doi = {10/f59ztg},
  eprint = {24885439},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Gallego Romero et al. - 2014 - RNA-seq impact of RNA degradation on transcript q.pdf},
  issn = {1741-7007},
  journaltitle = {BMC biology},
  keywords = {Gene Expression Profiling,Genes,Humans,Molecular Sequence Annotation,Principal Component Analysis,RNA Stability,RNA Stability: genetics,RNA; Messenger,RNA; Messenger: genetics,RNA; Messenger: metabolism,Sequence Analysis; RNA,Sequence Analysis; RNA: methods,Statistics; Nonparametric},
  number = {1},
  pages = {42},
  title = {{{RNA}}-Seq: Impact of {{RNA}} Degradation on Transcript Quantification},
  volume = {12}
}

@article{Gao2016,
  abstract = {Identifying latent structure in high-dimensional genomic data is essential for exploring biological processes. Here, we consider recovering gene co-expression networks from gene expression data, where each network encodes relationships between genes that are co-regulated by shared biological mechanisms. To do this, we develop a Bayesian statistical model for biclustering to infer subsets of co-regulated genes that covary in all of the samples or in only a subset of the samples. Our biclustering method, BicMix, allows overcomplete representations of the data, computational tractability, and joint modeling of unknown confounders and biological signals. Compared with related biclustering methods, BicMix recovers latent structure with higher precision across diverse simulation scenarios as compared to state-of-the-art biclustering methods. Further, we develop a principled method to recover context specific gene co-expression networks from the estimated sparse biclustering matrices. We apply BicMix to breast cancer gene expression data and to gene expression data from a cardiovascular study cohort, and we recover gene co-expression networks that are differential across ER+ and ER- samples and across male and female samples. We apply BicMix to the Genotype-Tissue Expression (GTEx) pilot data, and we find tissue specific gene networks. We validate these findings by using our tissue specific networks to identify trans-eQTLs specific to one of four primary tissues.},
  author = {Gao, Chuan and McDowell, Ian C. and Zhao, Shiwen and Brown, Christopher D. and Engelhardt, Barbara E.},
  date = {2016},
  doi = {10/gbgphd},
  eprint = {27467526},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Gao et al. - 2016 - Context Specific and Differential Gene Co-expressi.pdf},
  issn = {1553-7358},
  journaltitle = {PLoS Computational Biology},
  number = {7},
  pages = {1-39},
  title = {Context {{Specific}} and {{Differential Gene Co}}-Expression {{Networks}} via {{Bayesian Biclustering}}},
  volume = {12}
}

@article{Gehring2007,
  abstract = {Cytosine bases are extensively methylated in the DNA of plant genomes. DNA methylation has been implicated in the silencing of transposable elements and genes, and loss of methylation can have severe consequences for the organism. The recent methylation profiling of the entire Arabidopsis genome has provided insight into the extent of DNA methylation and its functions in silencing and gene transcription. Patterns of DNA methylation are faithfully maintained across generations, but some changes in DNA methylation are observed in terminally differentiated tissues. Demethylation by a DNA glycosylase is required for the expression of imprinted genes in the endosperm and de novo methylation might play a role in the selective silencing of certain self-incompatibility alleles in the tapetum. Because DNA methylation patterns are faithfully inherited, changes in DNA methylation that arise somatically during the plant life cycle have the possibility of being propagated. Therefore, epimutations might be an important source of variation during plant evolution. \textcopyright{} 2007 Elsevier B.V. All rights reserved.},
  author = {Gehring, Mary and Henikoff, Steven},
  date = {2007-05},
  doi = {10/fskxsc},
  file = {/Users/ryan/Documents/Zotero Library/Gehring and Henikoff - 2007 - DNA methylation dynamics in plant genomes.pdf},
  issn = {0167-4781},
  journaltitle = {Biochimica et Biophysica Acta - Gene Structure and Expression},
  keywords = {Arabidopsis,Demethylation,Maize,Methylation,Transcription,Transposons},
  number = {5-6},
  pages = {276-286},
  title = {{{DNA}} Methylation Dynamics in Plant Genomes},
  volume = {1769}
}

@online{GenomeReferenceConsortium,
  author = {{Genome Reference Consortium}},
  keywords = {\#nosource},
  title = {Genome {{Reference Consortium Human Build}} 37 ({{GRCh37}})},
  url = {https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.13/}
}

@article{Gentleman2004,
  abstract = {The Bioconductor project is an initiative for the collaborative creation of extensible software for computational biology and bioinformatics. The goals of the project include: fostering collaborative development and widespread use of innovative software, reducing barriers to entry into interdisciplinary scientific research, and promoting the achievement of remote reproducibility of research results. We describe details of our aims and methods, identify current challenges, compare Bioconductor to other open bioinformatics projects, and provide working examples.},
  author = {Gentleman, Robert C. and Carey, Vincent J. and Bates, Douglas M. and Bolstad, Ben and Dettling, Marcel and Dudoit, Sandrine and Ellis, Byron and Gautier, Laurent and Ge, Yongchao and Gentry, Jeff and Hornik, Kurt and Hothorn, Torsten and Huber, Wolfgang and Iacus, Stefano and Irizarry, Rafael and Leisch, Friedrich and Li, Cheng and Maechler, Martin and Rossini, Anthony J. and Sawitzki, Gunther and Smith, Colin and Smyth, Gordon and Tierney, Luke and Yang, Jean Y. H. and Zhang, Jianhua},
  date = {2004},
  doi = {10/c2xm5v},
  file = {/Users/ryan/Documents/Zotero Library/Gentleman et al. - 2004 - Bioconductor open software development for comput.pdf},
  issn = {1465-6914},
  journaltitle = {Genome biology},
  keywords = {Animal Genetics and Genomics,Bioinformatics,Evolutionary Biology,Human Genetics,Microbial Genetics and Genomics,Plant Genetics \& Genomics},
  number = {10},
  pages = {R80},
  title = {Bioconductor: Open Software Development for Computational Biology and Bioinformatics},
  volume = {5}
}

@collection{Gentleman2005,
  abstract = {Bioconductor is a widely used open source and open development software project for the analysis and comprehension of data arising from high-throughput experimentation in genomics and molecular biology. Bioconductor is rooted in the open source statistical computing environment R. This volume's coverage is broad and ranges across most of the key capabilities of the Bioconductor project, including importation and preprocessing of high-throughput data from microarray, proteomic, and flow cytometry platforms; curation and delivery of biological metadata for use in statistical modeling and interpretation; statistical analysis of high-throughput data, including machine learning and visualization; and modeling and visualization of graphs and networks.

The developers of the software, who are in many cases leading academic researchers, jointly authored chapters. All methods are illustrated with publicly available data, and a major section of the book is devoted to exposition of fully worked case studies.

This book is more than a static collection of descriptive text, figures, and code examples that were run by the authors to produce the text; it is a dynamic document. Code underlying all of the computations that are shown is made available on a companion website, and readers can reproduce every number, figure, and table on their own computers.},
  date = {2005},
  doi = {10/brh8rt},
  editor = {Gentleman, Robert and Carey, Vincent J. and Huber, Wolfgang and Irizarry, Rafael A. and Dudoit, Sandrine},
  editorb = {Wong, Wing and Gail, M. and Krickeberg, K. and Tsiatis, A. and Samet, J.},
  editorbtype = {redactor},
  isbn = {978-0-387-25146-2 978-0-387-29362-2},
  keywords = {\#nosource},
  location = {{New York, NY}},
  publisher = {{Springer New York}},
  series = {Statistics for {{Biology}} and {{Health}}},
  title = {Bioinformatics and {{Computational Biology Solutions Using R}} and {{Bioconductor}}},
  url = {http://link.springer.com/10.1007/0-387-29362-0},
  urldate = {2019-11-15}
}

@article{Gerard2017,
  abstract = {We combine two important ideas in the analysis of large-scale genomics experiments (e.g. experiments that aim to identify genes that are differentially expressed between two conditions). The first is use of Empirical Bayes (EB) methods to handle the large number of potentially-sparse effects, and estimate false discovery rates and related quantities. The second is use of factor analysis methods to deal with sources of unwanted variation such as batch effects and unmeasured confounders. We describe a simple modular fitting procedure that combines key ideas from both these lines of research. This yields new, powerful EB methods for analyzing genomics experiments that account for both sparse effects and unwanted variation. In realistic simulations, these new methods provide significant gains in power and calibration over competing methods. In real data analysis, we find that different methods, while often conceptually similar, can vary widely in their assessments of statistical significance. This highlights the need for care in both choice of methods and interpretation of results.},
  author = {Gerard, David and Stephens, Matthew},
  date = {2018-07-06},
  doi = {10/ggcxhp},
  file = {/Users/ryan/Documents/Zotero Library/Gerard and Stephens - 2018 - Empirical Bayes shrinkage and false discovery rate.pdf},
  issn = {1465-4644, 1468-4357},
  journaltitle = {Biostatistics},
  keywords = {batch effects,rna-seq,surrogate variable,unobserved confounding,unwanted variation},
  langid = {english},
  title = {Empirical {{Bayes}} Shrinkage and False Discovery Rate Estimation, Allowing for Unwanted Variation}
}

@online{Gerard2017a,
  abstract = {Unwanted variation, including hidden confounding, is a well-known problem in many fields, particularly large-scale gene expression studies. Recent proposals to use control genes --- genes assumed to be unassociated with the covariates of interest --- have led to new methods to deal with this problem. Going by the moniker Removing Unwanted Variation (RUV), there are many versions --- RUV1, RUV2, RUV4, RUVinv, RUVrinv, RUVfun. In this paper, we introduce a general framework, RUV*, that both unites and generalizes these approaches. This unifying framework helps clarify connections between existing methods. In particular we provide conditions under which RUV2 and RUV4 are equivalent. The RUV* framework also preserves an advantage of RUV approaches --- their modularity --- which facilitates the development of novel methods based on existing matrix imputation algorithms. We illustrate this by implementing RUVB, a version of RUV* based on Bayesian factor analysis. In realistic simulations based on real data we found that RUVB is competitive with existing methods in terms of both power and calibration, although we also highlight the challenges of providing consistently reliable calibration among data sets.},
  author = {Gerard, David and Stephens, Matthew},
  date = {2017},
  eprint = {1705.08393},
  eprinttype = {arxiv},
  file = {/Users/ryan/Documents/Zotero Library/Gerard and Stephens - 2017 - Unifying and Generalizing Methods for Removing Unw.pdf},
  keywords = {⛔ No DOI found,batch effects,correlated tests,gene expression,hidden confounding,negative controls,rna-seq,unobserved confounding,unwanted variation},
  title = {Unifying and {{Generalizing Methods}} for {{Removing Unwanted Variation Based}} on {{Negative Controls}}},
  url = {http://arxiv.org/abs/1705.08393}
}

@article{Gevaert2015,
  abstract = {Summary: DNA methylation is an important mechanism regulating gene transcription, and its role in carcinogenesis has been extensively studied. Hyper and hypomethylation of genes is an alternative mechanism to deregulate gene expression in a wide range of diseases. At the same time, high-throughput DNA methylation assays have been developed generating vast amounts of genome wide DNA methylation measurements. Yet, few tools exist that can formally identify hypo and hypermethylated genes that are predictive of transcription and thus functionally relevant for a particular disease. To accommodate this lack of tools, we developed MethylMix, an algorithm implemented in R to identify disease specific hyper and hypomethylated genes. MethylMix is based on a beta mixture model to identify methylation states and compares them with the normal DNA methylation state. MethylMix introduces a novel metric, the `Differential Methylation value' or DM-value defined as the difference of a methylation state with the normal methylation state. Finally, matched gene expression data are used to identify, besides differential, transcriptionally predictive methylation states by focusing on methylation changes that effect gene expression.

Availability and implementation: MethylMix was implemented as an R package and is available in bioconductor.

Contact: olivier.gevaert@stanford.edu},
  author = {Gevaert, Olivier},
  date = {2015},
  doi = {10/f7gczb},
  eprint = {25609794},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Gevaert - 2015 - MethylMix An R package for identifying DNA methyl.pdf},
  issn = {1367-4803, 1460-2059},
  journaltitle = {Bioinformatics},
  number = {11},
  pages = {1839-1841},
  title = {{{MethylMix}}: {{An R}} Package for Identifying {{DNA}} Methylation-Driven Genes},
  volume = {31}
}

@article{Gevaert2015a,
  abstract = {Aberrant DNA methylation is an important mechanism that contributes to oncogenesis. Yet, few algorithms exist that exploit this vast dataset to identify hypo- and hypermethylated genes in cancer. We developed a novel computational algorithm called MethylMix to identify differentially methylated genes that are also predictive of transcription. We apply MethylMix to 12 individual cancer sites, and additionally combine all cancer sites in a pancancer analysis. We discover pancancer hypo- and hypermethylated genes and identify novel methylation-driven subgroups with clinical implications. MethylMix analysis on combined cancer sites reveals 10 pancancer clusters reflecting new similarities across malignantly transformed tissues.},
  author = {Gevaert, Olivier and Tibshirani, Robert and Plevritis, Sylvia K},
  date = {2015},
  doi = {10/f66kv5},
  eprint = {25631659},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Gevaert et al. - 2015 - Pancancer analysis of DNA methylation-driven genes.pdf},
  issn = {1465-6906, 1474-760X},
  journaltitle = {Genome biology},
  number = {1},
  pages = {17},
  title = {Pancancer Analysis of {{DNA}} Methylation-Driven Genes Using {{MethylMix}}},
  volume = {16}
}

@article{Gibbs2007,
  abstract = {The rhesus macaque (Macaca mulatta) is an abundant primate species that diverged from the ancestors of Homo sapiens about 25 million years ago. Because they are genetically and physiologically similar to humans, rhesus monkeys are the most widely used nonhuman primate in basic and applied biomedical research. We determined the genome sequence of an Indian-origin Macaca mulatta female and compared the data with chimpanzees and humans to reveal the structure of ancestral primate genomes and to identify evidence for positive selection and lineage-specific expansions and contractions of gene families. A comparison of sequences from individual animals was used to investigate their underlying genetic diversity. The complete description of the macaque genome blueprint enhances the utility of this animal model for biomedical research and improves our understanding of the basic biology of the species.},
  author = {Gibbs, Richard A and Rogers, J. and Katze, Michael G and Bumgarner, Roger and Weinstock, George M and Mardis, Elaine R and Remington, Karin A and Strausberg, Robert L and Venter, J. C. and Wilson, Richard K and Batzer, Mark A and Bustamante, Carlos D and Eichler, Evan E and Hahn, Matthew W and Hardison, Ross C and Makova, Kateryna D and Miller, Webb and Milosavljevic, Aleksandar and Palermo, Robert E and Siepel, Adam and Sikela, James M and Attaway, Tony and Bell, Stephanie and Bernard, Kelly E and Buhay, Christian J and Chandrabose, Mimi N and Dao, Marvin and Davis, Clay and Delehaunty, Kimberly D and Ding, Yan and Dinh, Huyen H and Dugan-Rocha, Shannon and Fulton, Lucinda A and Gabisi, Ramatu Ayiesha and Garner, Toni T and Godfrey, Jennifer and Hawes, Alicia C and Hernandez, J. and Hines, Sandra and Holder, Michael and Hume, Jennifer and Jhangiani, Shalini N and Joshi, Vandita and Khan, Ziad Mohid and Kirkness, Ewen F and Cree, Andrew and Fowler, R Gerald and Lee, S. and Lewis, Lora R and Li, Zhangwan and Liu, Y.-s. and Moore, Stephanie M and Muzny, Donna and Nazareth, Lynne V and Ngo, Dinh Ngoc and Okwuonu, Geoffrey O and Pai, Grace and Parker, David and Paul, Heidie A and Pfannkoch, Cynthia and Pohl, Craig S and Rogers, Y.-H. and Ruiz, San Juana and Sabo, Aniko and Santibanez, Jireh and Schneider, Brian W and Smith, S. M. and Sodergren, Erica and Svatek, Amanda F and Utterback, Teresa R and Vattathil, Selina and Warren, Wesley and White, Courtney Sherell and Chinwalla, Asif T and Feng, Yucheng and Halpern, Aaron L and Hillier, Ladeana W and Huang, Xiaoqiu and Minx, Pat and Nelson, J. O. and Pepin, Kymberlie H and Qin, Xiang and Sutton, Granger G and Venter, Eli and Walenz, Brian P and Wallis, John W and Worley, Kim C and Yang, S.-P. and Jones, Steven M and Marra, Marco A and Rocchi, Mariano and Schein, Jacqueline E and Baertsch, Robert and Clarke, Laura and Csuros, M. and Glasscock, Jarret and Harris, R Alan and Havlak, Paul and Jackson, Andrew R and Jiang, Huaiyang and Liu, Y. and Messina, David N and Shen, Yufeng and Song, H. X.-Z. and Wylie, Todd and Zhang, Lan and Birney, Ewan and Han, K. and Konkel, Miriam K and Lee, Jungnam and Smit, Arian F A and Ullmer, Brygg and Wang, H. and Xing, Jinchuan and Burhans, Richard and Cheng, Ze and Karro, John E and Ma, Jian and Raney, Brian and She, Xinwei and Cox, Michael J and Demuth, Jeffery P and Dumas, Laura J and Han, S.-G. and Hopkins, Janet and Karimpour-Fard, Anis and Kim, Young H and Pollack, Jonathan R and Vinar, Tomas and Addo-Quaye, Charles and Degenhardt, Jeremiah and Denby, Alexandra and Hubisz, Melissa J and Indap, Amit and Kosiol, Carolin and Lahn, Bruce T and Lawson, Heather A and Marklein, Alison and Nielsen, Rasmus and Vallender, Eric J and Clark, Andrew G and Ferguson, Betsy and Hernandez, Ryan D and Hirani, Kashif and Kehrer-Sawatzki, Hildegard and Kolb, Jessica and Patil, Shobha and Pu, L.-L. and Ren, Yanru and Smith, D. G. and Wheeler, David A and Schenck, Ian and Ball, Edward V and Chen, Rui and Cooper, David N and Giardine, Belinda and Hsu, Fan and Kent, W James and Lesk, Arthur and Nelson, David L and O'Brien, W. E. and Prufer, K. and Stenson, Peter D and Wallace, James C and Ke, Hui and Liu, X.-M. and Wang, Peng and Xiang, Andy Peng and Yang, Fan and Barber, Galt P and Haussler, David and Karolchik, Donna and Kern, Andy D and Kuhn, Robert M and Smith, Kayla E and Zwieg, Ann S},
  date = {2007-04-13},
  doi = {10/b8r2vs},
  eprint = {17431167},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Gibbs et al. - 2007 - Evolutionary and Biomedical Insights from the Rhes.pdf;/Users/ryan/Documents/Zotero Library/Gibbs et al. - 2007 - Evolutionary and Biomedical Insights from the Rhes2.pdf},
  isbn = {1377298973},
  issn = {0036-8075},
  journaltitle = {Science},
  keywords = {Animals,Biomedical Research,DNA,Evolution,Female,Gene Duplication,Gene Rearrangement,Genetic Diseases,Genetic Variation,Genome,Humans,Inborn,Macaca mulatta,Macaca mulatta: genetics,Male,Molecular,Multigene Family,Mutation,Pan troglodytes,Pan troglodytes: genetics,Sequence Analysis,Species Specificity},
  number = {5822},
  pages = {222-234},
  title = {Evolutionary and {{Biomedical Insights}} from the {{Rhesus Macaque Genome}}},
  volume = {316}
}

@article{Gillespie2010,
  abstract = {Large scale microarray experiments are becoming increasingly routine, particularly those which track a number of different cell lines through time. This time-course information provides valuable insight into the dynamic mechanisms underlying the biological processes being observed. However, proper statistical analysis of time-course data requires the use of more sophisticated tools and complex statistical models.},
  author = {Gillespie, Colin S and Lei, Guiyuan and Boys, Richard J and Greenall, Amanda and Wilkinson, Darren J},
  date = {2010-01},
  doi = {10/btj4tg},
  eprint = {20302631},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Gillespie et al. - 2010 - Analysing time course microarray data using Biocon.pdf},
  issn = {1756-0500},
  journaltitle = {BMC research notes},
  pages = {81},
  title = {Analysing Time Course Microarray Data Using {{Bioconductor}}: A Case Study Using Yeast2 {{Affymetrix}} Arrays},
  volume = {3}
}

@misc{Girke2012,
  abstract = {Advanced R/Bioconductor Workshop on High-Throughput Genetic Analysis},
  author = {Girke, Thomas},
  date = {2012},
  file = {/Users/ryan/Documents/Zotero Library/Girke - 2012 - ChIP-Seq Analysis with R and Bioconductor.pdf},
  keywords = {presentation,tutorial},
  organization = {{Fred Hutchinson Cancer Research Center}},
  title = {{{ChIP}}-{{Seq Analysis}} with {{R}} and {{Bioconductor}}}
}

@unpublished{globin-reduction,
  abstract = {Primate blood contains high concentrations of globin messenger RNA. Globin reduction is a standard technique used to improve the expression results obtained by DNA microarrays on RNA from blood samples. However, with whole transcriptome RNA-sequencing (RNA-seq) quickly replacing microarrays for many applications, the impact of globin reduction for RNA-seq has not been previously studied. Moreover, no off-the-shelf kits are available for globin reduction in nonhuman primates. Here we report a protocol for RNA-seq in primate blood samples that uses complementary oligonucleotides to block reverse transcription of the alpha and beta globin genes. In test samples from cynomolgus monkeys (Macaca fascicularis), this globin blocking protocol approximately doubles the yield of informative (non-globin) reads by greatly reducing the fraction of globin reads, while also improving the consistency in sequencing depth between samples. The increased yield enables detection of about 2000 more genes, significantly increases the correlation in measured gene expression levels between samples, and increases the sensitivity of differential gene expression tests. These results show that globin blocking significantly improves the cost-effectiveness of mRNA sequencing in primate blood samples by doubling the yield of useful reads, allowing detection of more genes, and improving the precision of gene expression measurements. Based on these results, a globin reducing or blocking protocol is recommended for all RNA-seq studies of primate blood samples.},
  author = {Thompson, Ryan C. and Gelbart, Terri and Head, Steven R and Ordoukhanian, Phillip and Mullen, Courtney and Han, Dongmei and Berman, Dora M and Bartholomew, Amelia and Kenyon, Norma S and Salomon, Daniel R},
  date = {2019},
  keywords = {\#nosource},
  location = {{La Jolla, CA}},
  note = {Institution: The Scripps Research Institute},
  pubstate = {inpreparation},
  title = {Optimizing Yield of Deep {{RNA}} Sequencing for Gene Expression Profiling of Peripheral Blood Samples from Cynomolgus Monkeys ({{Macaca}} Fascicularis)}
}

@article{Goecks2010,
  abstract = {Increased reliance on computational approaches in the life sciences has revealed grave concerns about how accessible and reproducible computation-reliant results truly are. Galaxy (http://usegalaxy.org), an open web-based platform for genomic research, addresses these problems. Galaxy automatically tracks and manages data provenance and provides support for capturing the context and intent of computational methods. Galaxy Pages are interactive, web-based documents that provide users with a medium to communicate a complete computational analysis.},
  author = {Goecks, Jeremy and Nekrutenko, Anton and Taylor, James},
  date = {2010-01},
  doi = {10/bfkf6s},
  eprint = {20738864},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Goecks et al. - 2010 - Galaxy a comprehensive approach for supporting ac.pdf},
  issn = {1465-6914},
  journaltitle = {Genome biology},
  keywords = {Algorithms,Animals,Computational Biology,Computational Biology: methods,Databases; Nucleic Acid,Genomics,Genomics: methods,Humans,Internet},
  number = {8},
  pages = {R86},
  title = {Galaxy: A Comprehensive Approach for Supporting Accessible, Reproducible, and Transparent Computational Research in the Life Sciences},
  volume = {11}
}

@article{Gomes2014,
  abstract = {The comprehension of protein and DNA binding in vivo is essential to understand gene regulation. Chromatin immunoprecipitation followed by sequencing (ChIP-seq) provides a global map of the regulatory binding network. Most ChIP-seq analysis tools focus on identifying binding regions from coverage enrichment. However, less work has been performed to infer the physical and regulatory details inside the enriched regions. This research extends a previous blind-deconvolution approach to develop a post-peak-calling algorithm that improves binding site resolution and predicts cooperative interactions. At the core of our new method is a physically motivated model that characterizes the binding signal as an extreme value distribution. This model suggests a mathematical framework to study physical properties of DNA shearing from the ChIP-seq coverage. The model explains the ChIP-seq coverage with two signals: The first considers DNA fragments with only a single binding event, whereas the second considers fragments with two binding events (a double-binding signal). The model incorporates motif discovery and is able to detect multiple sites in an enriched region with single-nucleotide resolution, high sensitivity, and high specificity. Our method improves peak caller sensitivity, from less than 45\% up to 94\%, at a false positive rate {$<$} 11\% for a set of 47 experimentally validated prokaryotic sites. It also improves resolution of highly enriched regions of large-scale eukaryotic data sets. The double-binding signal provides a novel application in ChIP-seq analysis: the identification of cooperative interaction. Predictions of known cooperative binding sites show a 0.85 area under an ROC curve.},
  author = {Gomes, Antonio L C and Abeel, Thomas and Peterson, Matthew and Azizi, Elham and Lyubetskaya, Anna and Carvalho, Lu{\'i}s and Galagan, James},
  date = {2014-10},
  doi = {10/f6jxvp},
  eprint = {25024162},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Gomes et al. - 2014 - Decoding ChIP-seq with a double-binding signal ref.pdf},
  issn = {1549-5469},
  journaltitle = {Genome Research},
  number = {10},
  pages = {1686--1697},
  title = {Decoding {{ChIP}}-Seq with a Double-Binding Signal Refines Binding Peaks to Single-Nucleotides and Predicts Cooperative Interaction},
  volume = {24}
}

@software{gordbrownGreyListChIPGreyLists2019,
  abstract = {Identify regions of ChIP experiments with high signal in the input, that lead to spurious peaks during peak calling. Remove reads aligning to these regions prior to peak calling, for cleaner ChIP analysis.},
  author = {Brown, Gord},
  date = {2019},
  doi = {10/dd8m},
  ids = {bioc-greylistchip},
  keywords = {Alignment,ChIPSeq,Coverage,DifferentialPeakCalling,GenomeAnnotation,Preprocessing,Sequencing,Software},
  organization = {{Bioconductor version: Release (3.10)}},
  shorttitle = {{{GreyListChIP}}},
  title = {{{GreyListChIP}}: {{Grey Lists}} -- {{Mask Artefact Regions Based}} on {{ChIP Inputs}}},
  url = {https://bioconductor.org/packages/GreyListChIP/},
  urldate = {2019-11-14},
  version = {1.18.0}
}

@article{Grant2011,
  abstract = {A critical task in high-throughput sequencing is aligning millions of short reads to a reference genome. Alignment is especially complicated for RNA sequencing (RNA-Seq) because of RNA splicing. A number of RNA-Seq algorithms are available, and claim to align reads with high accuracy and efficiency while detecting splice junctions. RNA-Seq data are discrete in nature; therefore, with reasonable gene models and comparative metrics RNA-Seq data can be simulated to sufficient accuracy to enable meaningful benchmarking of alignment algorithms. The exercise to rigorously compare all viable published RNA-Seq algorithms has not been performed previously.},
  author = {Grant, Gregory R and Farkas, Michael H and Pizarro, Angel D and Lahens, Nicholas F and Schug, Jonathan and Brunk, Brian P and Stoeckert, Christian J and Hogenesch, John B and Pierce, Eric A},
  date = {2011-09-15},
  doi = {10/cz9295},
  eprint = {21775302},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Grant et al. - 2011 - Comparative analysis of RNA-Seq alignment algorith.pdf},
  issn = {1367-4811},
  journaltitle = {Bioinformatics},
  keywords = {Algorithms,Animals,Base Sequence,Benchmarking,Cluster Analysis,Exons,Gene Library,Genome,High-Throughput Nucleotide Sequencing,Mice,Models; Genetic,Molecular Sequence Data,RNA,RNA Splicing,RNA: genetics,Sequence Alignment,Sequence Analysis; RNA,Sequence Analysis; RNA: methods,Software},
  number = {18},
  pages = {2518--2528},
  title = {Comparative Analysis of {{RNA}}-{{Seq}} Alignment Algorithms and the {{RNA}}-{{Seq}} Unified Mapper ({{RUM}})},
  volume = {27}
}

@article{Grossmann2007a,
  abstract = {MOTIVATION: High-throughput experiments such as microarray hybridizations often yield long lists of genes found to share a certain characteristic such as differential expression. Exploring Gene Ontology (GO) annotations for such lists of genes has become a widespread practice to get first insights into the potential biological meaning of the experiment. The standard statistical approach to measuring overrepresentation of GO terms cannot cope with the dependencies resulting from the structure of GO because they analyze each term in isolation. Especially the fact that annotations are inherited from more specific descendant terms can result in certain types of false-positive results with potentially misleading biological interpretation, a phenomenon which we term the inheritance problem.

RESULTS: We present here a novel approach to analysis of GO term overrepresentation that determines overrepresentation of terms in the context of annotations to the term's parents. This approach reduces the dependencies between the individual term's measurements, and thereby avoids producing false-positive results owing to the inheritance problem. ROC analysis using study sets with overrepresented GO terms showed a clear advantage for our approach over the standard algorithm with respect to the inheritance problem. Although there can be no gold standard for exploratory methods such as analysis of GO term overrepresentation, analysis of biological datasets suggests that our algorithm tends to identify the core GO terms that are most characteristic of the dataset being analyzed.},
  author = {Grossmann, Steffen and Bauer, Sebastian and Robinson, Peter N and Vingron, Martin},
  date = {2007-11-15},
  doi = {10/df3392},
  eprint = {17848398},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Grossmann et al. - 2007 - Improved detection of overrepresentation of Gene-O.pdf},
  issn = {1367-4811},
  journaltitle = {Bioinformatics},
  keywords = {Algorithms,Gene Expression Profiling,Gene Expression Profiling: methods,Genes,Genes: genetics,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods,Phylogeny,Reproducibility of Results,Sensitivity and Specificity},
  number = {22},
  pages = {3024--3031},
  title = {Improved Detection of Overrepresentation of {{Gene}}-{{Ontology}} Annotations with Parent Child Analysis},
  volume = {23}
}

@article{Gunady2018,
  abstract = {Introduction: Analysis of differential alternative splicing from RNA-seq data is complicated by the fact that many RNA-seq reads map to multiple transcripts, besides, the annotated transcripts are often a small subset of the possible transcripts of a gene. Here we describe Yanagi, a tool for segmenting transcriptome to create a library of maximal L-disjoint segments from a complete transcriptome annotation. That segment library preserves all transcriptome substrings of length L and transcripts structural relationships while eliminating unnecessary sequence duplications.

Contributions: In this paper, we formalize the concept of transcriptome segmentation and propose an efficient algorithm for generating segment libraries based on a length parameter dependent on specific RNA-Seq library construction. The resulting segment sequences can be used with pseudo-alignment tools to quantify expression at the segment level. We characterize the segment libraries for the reference transcriptomes of Drosophila melanogaster and Homo sapiens and provide gene-level visualization of the segments for better interpretability. Then we demonstrate the use of segments-level quantification into gene expression and alternative splicing analysis. The notion of transcript segmentation as introduced here and implemented in Yanagi opens the door for the application of lightweight, ultra-fast pseudo-alignment algorithms in a wide variety of RNA-seq analyses.

Conclusion: Using segment library rather than the standard transcriptome succeeds in significantly reducing ambigious alignments where reads are multimapped to several sequences in the reference. That allowed avoiding the quantification step required by standard kmer-based pipelines for gene expression analysis. Moreover, using segment counts as statistics for alternative splicing analysis enables achieving comparable performance to counting-based approaches (e.g. rMATS) while rather using fast and lighthweight pseudo alignment.},
  author = {Gunady, M. K. and Mount, S. and Corrada Bravo, H.},
  date = {2018},
  doi = {10/ggcxk4},
  file = {/Users/ryan/Documents/Zotero Library/MK et al. - 2018 - Fast and interpretable alternative splicing and di.pdf},
  journaltitle = {bioRxiv Bioinformatics},
  pages = {1--23},
  title = {Fast and Interpretable Alternative Splicing and Differential Gene-Level Expression Analysis Using Transcriptome Segmentation with {{Yanagi}}}
}

@article{Guo2007,
  abstract = {In this paper, we introduce a modified version of linear discriminant analysis, called the "shrunken centroids regularized discriminant analysis" (SCRDA). This method generalizes the idea of the "nearest shrunken centroids" (NSC) (Tibshirani and others, 2003) into the classical discriminant analysis. The SCRDA method is specially designed for classification problems in high dimension low sample size situations, for example, microarray data. Through both simulated data and real life data, it is shown that this method performs very well in multivariate classification problems, often outperforms the PAM method (using the NSC algorithm) and can be as competitive as the support vector machines classifiers. It is also suitable for feature elimination purpose and can be used as gene selection method. The open source R package for this method (named "rda") is available on CRAN (http://www.r-project.org) for download and testing.},
  author = {Guo, Yaqian and Hastie, Trevor and Tibshirani, Robert},
  date = {2007-01},
  doi = {10/bnc4tx},
  eprint = {16603682},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Guo et al. - 2007 - Regularized linear discriminant analysis and its a.pdf},
  issn = {1465-4644},
  journaltitle = {Biostatistics},
  keywords = {Computer Simulation,Discriminant Analysis,DNA; Neoplasm,DNA; Neoplasm: genetics,Gene Expression Profiling,Gene Expression Profiling: methods,Humans,Linear Models,Neoplasms,Neoplasms: classification,Neoplasms: genetics,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods},
  number = {1},
  pages = {86--100},
  title = {Regularized Linear Discriminant Analysis and Its Application in Microarrays},
  volume = {8}
}

@article{Guo2010,
  abstract = {BACKGROUND: data generated using 'omics' technologies are characterized by high dimensionality, where the number of features measured per subject vastly exceeds the number of subjects in the study. In this paper, we consider issues relevant in the design of biomedical studies in which the goal is the discovery of a subset of features and an associated algorithm that can predict a binary outcome, such as disease status. We compare the performance of four commonly used classifiers (K-Nearest Neighbors, Prediction Analysis for Microarrays, Random Forests and Support Vector Machines) in high-dimensionality data settings. We evaluate the effects of varying levels of signal-to-noise ratio in the dataset, imbalance in class distribution and choice of metric for quantifying performance of the classifier. To guide study design, we present a summary of the key characteristics of 'omics' data profiled in several human or animal model experiments utilizing high-content mass spectrometry and multiplexed immunoassay based techniques.

RESULTS: the analysis of data from seven 'omics' studies revealed that the average magnitude of effect size observed in human studies was markedly lower when compared to that in animal studies. The data measured in human studies were characterized by higher biological variation and the presence of outliers. The results from simulation studies indicated that the classifier Prediction Analysis for Microarrays (PAM) had the highest power when the class conditional feature distributions were Gaussian and outcome distributions were balanced. Random Forests was optimal when feature distributions were skewed and when class distributions were unbalanced. We provide a free open-source R statistical software library (MVpower) that implements the simulation strategy proposed in this paper.

CONCLUSION: no single classifier had optimal performance under all settings. Simulation studies provide useful guidance for the design of biomedical studies involving high-dimensionality data.},
  author = {Guo, Yu and Graber, Armin and McBurney, Robert N and Balasubramanian, Raji},
  date = {2010-01},
  doi = {10/b245tt},
  eprint = {20815881},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Guo et al. - 2010 - Sample size and statistical power considerations i.pdf},
  issn = {1471-2105},
  journaltitle = {BMC Bioinformatics},
  keywords = {Algorithms,Animals,Classification,Classification: methods,Databases; Factual,Gene Expression Profiling,Gene Expression Profiling: methods,Humans,Models; Statistical,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods,Pattern Recognition; Automated,Sample Size},
  pages = {447},
  title = {Sample Size and Statistical Power Considerations in High-Dimensionality Data Settings: A Comparative Study of Classification Algorithms},
  volume = {11}
}

@article{Hachiya2009,
  abstract = {A conserved gene cluster (also referred to as a conserved gene order) is defined as a cluster of neighboring genes whose gene order is conserved across several species. In the present study, we propose a novel workflow which enables sensitive detection of conserved gene clusters by taking into account the information of gene order conservation in the step to identify orthologous genes (OGs). Our workflow was applied to large-scale comparisons of 101 prokaryotic and 15 fungal genomes. Thereafter, we examined the difference between OGs in conserved gene clusters (clustered OGs) and OGs that are not the members of conserved gene clusters (isolated OGs). Our analysis confirms the finding in previous studies that, in prokaryotes, protein sequences of clustered OGs are more conserved than those of isolated OGs. In addition, this interesting correlation between protein sequence homology and gene order conservation were observed also in fungal genomes. To our knowledge, this is the first report of a systematic survey of such correlation in eukaryotic genomes. Furthermore, we analyzed evolutionary forces behind the correlation by estimating the rate of synonymous substitutions (K S) and the rate of nonsynonymous substitutions (K A). This detailed sequence analysis reveals that although the correlation is consistently observed and seems to be a general trend among prokaryotic and fungal genomes, the evolutionary forces behind the correlation are different among lineages, suggesting that the joint effect of heterogeneous underlying mechanisms would result in the correlation.},
  author = {Hachiya, Tsuyoshi and Sakakibara, Yasubumi},
  date = {2009-01-01},
  file = {/Users/ryan/Documents/Zotero Library/Hachiya and Sakakibara - 2009 - Sensitive Detection of Conserved Gene Clusters Unr.pdf},
  journaltitle = {Genes, Genomes and Genomics},
  keywords = {⛔ No DOI found,abbreviation,comparative genomics,conserved gene cluster,genome organization,og,ortholog,orthologous gene,substitution rate},
  shortjournal = {Genes, Genomes and Genomics},
  title = {Sensitive {{Detection}} of {{Conserved Gene Clusters Unravels}} the {{Evolutionary Forces}} behind the {{Correlation}} between {{Protein Sequence Homology}} and {{Gene Order Conservation}}},
  volume = {3}
}

@article{Hachiya2009a,
  abstract = {MOTIVATION: The accurate detection of orthologous segments (also referred to as syntenic segments) plays a key role in comparative genomics, as it is useful for inferring genome rearrangement scenarios and computing whole-genome alignments. Although a number of algorithms for detecting orthologous segments have been proposed, none of them contain a framework for optimizing their parameter values. METHODS: In the present study, we propose an algorithm, named OSfinder (Orthologous Segment finder), which uses a novel scoring scheme based on stochastic models. OSfinder takes as input the positions of short homologous regions (also referred to as anchors) and explicitly discriminates orthologous anchors from non-orthologous anchors by using Markov chain models which represent respective geometric distributions of lengths of orthologous and non-orthologous anchors. Such stochastic modeling makes it possible to optimize parameter values by maximizing the likelihood of the input dataset, and to automate the setting of the optimal parameter values. RESULTS: We validated the accuracies of orthology-mapping algorithms on the basis of their consistency with the orthology annotation of genes. Our evaluation tests using mammalian and bacterial genomes demonstrated that OSfinder shows higher accuracy than previous algorithms. AVAILABILITY: The OSfinder software was implemented as a C++ program. The software is freely available at http://osfinder.dna.bio.keio.ac.jp under the GNU General Public License. SUPPLEMENTARY INFORMATION: Supplementary data are available at Bioinformatics online.},
  author = {Hachiya, Tsuyoshi and Osana, Yasunori and Popendorf, Kris and Sakakibara, Yasubumi},
  date = {2009},
  doi = {10/fjjfsv},
  eprint = {19188192},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Hachiya et al. - 2009 - Accurate identification of orthologous segments am.pdf},
  issn = {1367-4803},
  journaltitle = {Bioinformatics},
  number = {7},
  pages = {853--860},
  title = {Accurate Identification of Orthologous Segments among Multiple Genomes},
  volume = {25}
}

@article{Hall2018,
  abstract = {Diabetes is an increasing problem worldwide; almost 30 million people, nearly 10\% of the population, in the United States are diagnosed with diabetes. Another 84 million are prediabetic, and without intervention, up to 70\% of these individuals may progress to type 2 diabetes. Current methods for quantifying blood glucose dysregulation in diabetes and prediabetes are limited by reliance on single-time-point measurements or on average measures of overall glycemia and neglect glucose dynamics. We have used continuous glucose monitoring (CGM) to evaluate the frequency with which individuals demonstrate elevations in postprandial glucose, the types of patterns, and how patterns vary between individuals given an identical nutrient challenge. Measurement of insulin resistance and secretion highlights the fact that the physiology underlying dysglycemia is highly variable between individuals. We developed an analytical framework that can group individuals according to specific patterns of glycemic responses called ``glucotypes'' that reveal heterogeneity, or subphenotypes, within traditional diagnostic categories of glucose regulation. Importantly, we found that even individuals considered normoglycemic by standard measures exhibit high glucose variability using CGM, with glucose levels reaching prediabetic and diabetic ranges 15\% and 2\% of the time, respectively. We thus show that glucose dysregulation, as characterized by CGM, is more prevalent and heterogeneous than previously thought and can affect individuals considered normoglycemic by standard measures, and specific patterns of glycemic responses reflect variable underlying physiology. The interindividual variability in glycemic responses to standardized meals also highlights the personal nature of glucose regulation.
Through extensive phenotyping, we developed a model for identifying potential mechanisms of personal glucose dysregulation and built a webtool for visualizing a user-uploaded CGM profile and classifying individualized glucose patterns into glucotypes.},
  author = {Hall, Heather and Perelman, Dalia and Breschi, Alessandra and Limcaoco, Patricia and Kellogg, Ryan and McLaughlin, Tracey and Snyder, Michael},
  date = {2018},
  doi = {10/gdwxrm},
  file = {/Users/ryan/Documents/Zotero Library/Hall et al. - 2018 - Glucotypes reveal new patterns of glucose dysregul.pdf},
  issn = {1545-7885},
  journaltitle = {PLoS Biology},
  number = {7},
  pages = {1--23},
  title = {Glucotypes Reveal New Patterns of Glucose Dysregulation},
  volume = {16}
}

@article{Hamfjord2012,
  abstract = {We present the results of a global study of dysregulated miRNAs in paired samples of normal mucosa and tumor from eight patients with colorectal cancer. Although there is existing data of miRNA contribution to colorectal tumorigenesis, these studies are typically small to medium scale studies of cell lines or non-paired tumor samples. The present study is to our knowledge unique in two respects. Firstly, the normal and adjacent tumor tissue samples are paired, thus taking into account the baseline differences between individuals when testing for differential expression. Secondly, we use high-throughput sequencing, thus enabling a comprehensive survey of all miRNAs expressed in the tissues. We use Illumina sequencing technology to perform sequencing and two different tools to statistically test for differences in read counts per gene between samples: edgeR when using the pair information and DESeq when ignoring this information, i.e., treating tumor and normal samples as independent groups. We identify 37 miRNAs that are significantly dysregulated in both statistical approaches, 19 down-regulated and 18 up-regulated. Some of these miRNAs are previously published as potential regulators in colorectal adenocarcinomas such as miR-1, miR-96 and miR-145. Our comprehensive survey of differentially expressed miRNAs thus confirms some existing findings. We have also discovered 16 dysregulated miRNAs, which to our knowledge have not previously been associated with colorectal carcinogenesis: the following significantly down-regulated miR-490-3p, -628-3p/-5p, -1297, -3151, -3163, -3622a-5p, -3656 and the up-regulated miR-105, -549, -1269, -1827, -3144-3p, -3177, -3180-3p, -4326. Although the study is preliminary with only eight patients included, we believe the results add to the present knowledge on miRNA dysregulation in colorectal carcinogenesis.
As such the results would serve as a robust training set for validation of potential biomarkers in a larger cohort study. Finally, we also present data supporting the hypothesis that there are differences in miRNA expression between adenocarcinomas and neuroendocrine tumors of the colon.},
  author = {Hamfjord, Julian and Stangeland, Astrid M and Hughes, Timothy and Skrede, Martina L and Tveit, Kjell M and Ikdahl, Tone and Kure, Elin H},
  date = {2012-01},
  doi = {10/f3w366},
  eprint = {22529906},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Hamfjord et al. - 2012 - Differential expression of miRNAs in colorectal ca.pdf},
  issn = {1932-6203},
  journaltitle = {PLoS ONE},
  number = {4},
  pages = {e34150},
  title = {Differential Expression of {{miRNAs}} in Colorectal Cancer: Comparison of Paired Tumor Tissue and Adjacent Normal Mucosa Using High-Throughput Sequencing},
  volume = {7}
}

@article{Han2014,
  author = {Han, Pei and Li, Wei and Lin, Chiou-Hong and Yang, Jin and Shang, Ching and Nurnberg, Sylvia T. and Jin, Kevin Kai and Xu, Weihong and Lin, Chieh-Yu and Lin, Chien-jung and Xiong, Yiqin and Chien, Huan-chieh and Zhou, Bin and Ashley, Euan and Bernstein, Daniel and Chen, Peng-Sheng and Chen, Huei-sheng Vincent and Quertermous, Thomas and Chang, Ching-pin},
  date = {2014-08-10},
  doi = {10/f6h4k2},
  file = {/Users/ryan/Documents/Zotero Library/Han et al. - 2014 - A long noncoding RNA protects the heart from patho.pdf},
  issn = {0028-0836},
  journaltitle = {Nature},
  number = {7520},
  pages = {102--106},
  title = {A Long Noncoding {{RNA}} Protects the Heart from Pathological Hypertrophy},
  volume = {514}
}

@article{Hansen2011,
  abstract = {The ability to measure gene expression on a genome-wide scale is one of the most promising accomplishments in molecular biology. Microarrays, the technology that first permitted this, were riddled with problems due to unwanted sources of variability. Many of these problems are now mitigated, after a decade's worth of statistical methodology development. The recently developed RNA sequencing (RNA-seq) technology has generated much excitement in part due to claims of reduced variability in comparison to microarrays. However, we show that RNA-seq data demonstrate unwanted and obscuring variability similar to what was first observed in microarrays. In particular, we find guanine-cytosine content (GC-content) has a strong sample-specific effect on gene expression measurements that, if left uncorrected, leads to false positives in downstream results. We also report on commonly observed data distortions that demonstrate the need for data normalization. Here, we describe a statistical methodology that improves precision by 42\% without loss of accuracy. Our resulting conditional quantile normalization algorithm combines robust generalized regression to remove systematic bias introduced by deterministic features such as GC-content and quantile normalization to correct for global distortions.},
  author = {Hansen, Kasper D and Irizarry, Rafael A and Wu, Zhijin},
  date = {2012-04},
  doi = {10/fzj4vk},
  eprint = {22285995},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Hansen et al. - 2012 - Removing technical variability in RNA-seq data usi.pdf},
  issn = {1468-4357},
  journaltitle = {Biostatistics},
  keywords = {Algorithms,Analysis of Variance,Base Composition,Biostatistics,Databases,Gene Expression Profiling,Gene Expression Profiling: statistics \& numerical,High-Throughput Nucleotide Sequencing,High-Throughput Nucleotide Sequencing: statistics,Humans,Nucleic Acid,Nucleic Acid: statistics \& numerical data,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: statistic,RNA,RNA: statistics \& numerical data,Sequence Analysis},
  number = {2},
  pages = {204--216},
  title = {Removing Technical Variability in {{RNA}}-Seq Data Using Conditional Quantile Normalization},
  volume = {13}
}

@article{Hara2000,
  abstract = {An immediate-early, transiently activated wound-responsive gene was identified in tobacco by fluorescent differential display screening. The full-length cDNA encodes a polypeptide of 356 amino acids with a relative molecular mass of 39,082 Da. The deduced amino acid sequence shows two characteristic features; a leucine-zipper motif found in the more N-terminal region and a WRKY domain containing a zinc-finger motif located in the central region. The gene was designated as wizz (wound-induced leucine zipper zinc finger). Northern analysis showed that upon wounding wizz transcripts were locally and systemically accumulated within 10 min, reached a maximum level by 30 min, and decreased thereafter to the basal level. Analyses; of a WIZZ-GFP fusion protein clearly indicated that WIZZ is a nuclear factor. WIZZ specifically binds to sequences containing two TTGAC core motifs that are separated by a spacer of appropriate length. The binding activity was dependent on bivalent cations, most probably zinc. In transient reporter assays, however, WIZZ did not show transactivation activity in tobacco suspension cells, suggesting that it functions together with other components. The results indicate that WIZZ is a new transcription factor which participates in early stages of the wound response.},
  author = {Hara, K. and Yagi, M. and Kusano, T. and Sano, H.},
  date = {2000-02-23},
  doi = {10/ddshq3},
  file = {/Users/ryan/Documents/Zotero Library/Hara et al. - 2000 - Rapid systemic accumulation of transcripts encodin.pdf},
  issn = {0026-8925},
  journaltitle = {Molecular and General Genetics},
  keywords = {Fluorescent differential display,Tobacco,Transcription factor,Wounding,WRKY family},
  number = {1},
  pages = {30--37},
  title = {Rapid Systemic Accumulation of Transcripts Encoding a Tobacco {{WRKY}} Transcription Factor upon Wounding},
  volume = {263}
}

@article{Hardcastle2010,
  abstract = {High throughput sequencing has become an important technology for studying expression levels in many types of genomic, and particularly transcriptomic, data. One key way of analysing such data is to look for elements of the data which display particular patterns of differential expression in order to take these forward for further analysis and validation.},
  author = {Hardcastle, Thomas J and Kelly, Krystyna A},
  date = {2010-01},
  doi = {10/fh29rb},
  eprint = {20698981},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Hardcastle and Kelly - 2010 - baySeq empirical Bayesian methods for identifying.pdf},
  issn = {1471-2105},
  journaltitle = {BMC Bioinformatics},
  keywords = {Algorithms,Arabidopsis,Arabidopsis: genetics,Base Sequence,Bayes Theorem,Gene Expression Profiling,Gene Expression Profiling: methods,Research Design,RNA; Plant,RNA; Plant: genetics},
  pages = {422},
  title = {{{baySeq}}: Empirical {{Bayesian}} Methods for Identifying Differential Expression in Sequence Count Data},
  volume = {11}
}

@article{Harmsen2017,
  abstract = {The unique spectral signatures and biologically inert compositions of surface-enhanced resonance Raman scattering (SERRS) nanoparticles make them promising contrast agents for in vivo cancer imaging. Our SERRS nanoparticles consist of a 60-nm gold nanoparticle core that is encapsulated in a 15-nm-thick silica shell wherein the resonant Raman reporter is embedded. Subtle aspects of their preparation can shift their limit of detection by orders of magnitude. In this protocol, we present the optimized, step-by-step procedure for generating reproducible SERRS nanoparticles with femtomolar (10{$^{-15}$} M) limits of detection. We provide ways of characterizing the optical properties of SERRS nanoparticles using UV/VIS and Raman spectroscopy, and their physicochemical properties using transmission electron microscopy and nanoparticle tracking analysis. We introduce several applications of these nanoprobes for biomedical research, with a focus on intraoperative cancer imaging via Raman imaging. A detailed account is provided for successful i.v. administration of SERRS nanoparticles such that delineation of cancerous lesions can be achieved in vivo and ex vivo on resected tissues without the need for specific biomarker targeting. This straightforward, yet comprehensive, protocol---from initial de novo gold nanoparticle synthesis to SERRS nanoparticle contrast-enhanced preclinical Raman imaging in animal models---takes {$\sim$}96 h.},
  author = {Harmsen, Stefan and Wall, Matthew A. and Huang, Ruimin and Kircher, Moritz F.},
  date = {2017},
  doi = {10/gbkspv},
  eprint = {28686581},
  eprinttype = {pmid},
  issn = {1750-2799},
  journaltitle = {Nature Protocols},
  number = {7},
  pages = {1400-1414},
  title = {Cancer Imaging Using Surface-Enhanced Resonance {{Raman}} Scattering Nanoparticles},
  volume = {12}
}

@article{Harrow2012,
  abstract = {The GENCODE Consortium aims to identify all gene features in the human genome using a combination of computational analysis, manual annotation, and experimental validation. Since the first public release of this annotation data set, few new protein-coding loci have been added, yet the number of alternative splicing transcripts annotated has steadily increased. The GENCODE 7 release contains 20,687 protein-coding and 9640 long noncoding RNA loci and has 33,977 coding transcripts not represented in UCSC genes and RefSeq. It also has the most comprehensive annotation of long noncoding RNA (lncRNA) loci publicly available with the predominant transcript form consisting of two exons. We have examined the completeness of the transcript annotation and found that 35\% of transcriptional start sites are supported by CAGE clusters and 62\% of protein-coding genes have annotated polyA sites. Over one-third of GENCODE protein-coding genes are supported by peptide hits derived from mass spectrometry spectra submitted to Peptide Atlas. New models derived from the Illumina Body Map 2.0 RNA-seq data identify 3689 new loci not currently in GENCODE, of which 3127 consist of two exon models indicating that they are possibly unannotated long noncoding loci. GENCODE 7 is publicly available from gencodegenes.org and via the Ensembl and UCSC Genome Browsers. \textcopyright{} 2012, Published by Cold Spring Harbor Laboratory Press.},
  author = {Harrow, Jennifer and Frankish, Adam and Gonzalez, Jose M. and Tapanari, Electra and Diekhans, Mark and Kokocinski, Felix and Aken, Bronwen L. and Barrell, Daniel and Zadissa, Amonida and Searle, Stephen and Barnes, If and Bignell, Alexandra and Boychenko, Veronika and Hunt, Toby and Kay, Mike and Mukherjee, Gaurab and Rajan, Jeena and Despacio-Reyes, Gloria and Saunders, Gary and Steward, Charles and Harte, Rachel and Lin, Michael and Howald, C\'edric and Tanzer, Andrea and Derrien, Thomas and Chrast, Jacqueline and Walters, Nathalie and Balasubramanian, Suganthi and Pei, Baikang and Tress, Michael and Rodriguez, Jose Manuel and Ezkurdia, Iakes and Van Baren, Jeltje and Brent, Michael and Haussler, David and Kellis, Manolis and Valencia, Alfonso and Reymond, Alexandre and Gerstein, Mark and Guig\'o, Roderic and Hubbard, Tim J.},
  date = {2012},
  doi = {10/f4w5m5},
  file = {/Users/ryan/Documents/Zotero Library/Harrow et al. - 2012 - GENCODE The reference human genome annotation for.pdf},
  issn = {1088-9051},
  journaltitle = {Genome Research},
  number = {9},
  pages = {1760-1774},
  title = {{{GENCODE}}: {{The}} Reference Human Genome Annotation for the {{ENCODE}} Project},
  volume = {22}
}

@unpublished{Hart,
  abstract = {Application of sequencing technologies to transcript quantitation (RNA-seq) has revealed a vast transcriptome that might be fully measured only by ultradeep sequencing. We show that a human cell's transcriptome can be divided into ``active'' genes carrying out the work of the cell and ``noisy'' gene by-products. Active genes are highly expressed in their native tissues, but are often noisy genes elsewhere. Noisy genes are physically near active genes but have opposite epigenetic signatures, are not detected in proteomic studies, and are not essential genes. At moderate read depth, RNA-seq can accurately measure differential expression of active genes with high correlation to microarray studies. At this depth, 12 or more samples can be assayed in a single lane on an Illumina HiSeq platform with negligible information loss compared to deeper sequencing and with greater coverage than microarrays, enabling crucial experimental measurement of biological variation for the same sequencing resources.},
  author = {Hart, G Traver and Komori, H Kiyomi and LaMere, Sarah A and Podshivalova, Katie and Grigoriev, Yevgeniy A and Salomon, Daniel R},
  file = {/Users/ryan/Documents/Zotero Library/Hart et al. - The Noisy Human Transcriptome Implications for th.pdf},
  keywords = {⛔ No DOI found},
  title = {The {{Noisy Human Transcriptome}}: {{Implications}} for the {{Design}} of {{Efficient RNA}}-{{Seq Experiments}}}
}

@article{Hart2013,
  abstract = {BACKGROUND: Early application of second-generation sequencing technologies to transcript quantitation (RNA-seq) has hinted at a vast mammalian transcriptome, including transcripts from nearly all known genes, which might be fully measured only by ultradeep sequencing. Subsequent studies suggested that low-abundance transcripts might be the result of technical or biological noise rather than active transcripts; moreover, most RNA-seq experiments did not provide enough read depth to generate high-confidence estimates of gene expression for low-abundance transcripts. As a result, the community adopted several heuristics for RNA-seq analysis, most notably an arbitrary expression threshold of 0.3 - 1 FPKM for downstream analysis. However, advances in RNA-seq library preparation, sequencing technology, and informatic analysis have addressed many of the systemic sources of uncertainty and undermined the assumptions that drove the adoption of these heuristics. We provide an updated view of the accuracy and efficiency of RNA-seq experiments, using genomic data from large-scale studies like the ENCODE project to provide orthogonal information against which to validate our conclusions.

RESULTS: We show that a human cell's transcriptome can be divided into active genes carrying out the work of the cell and other genes that are likely the by-products of biological or experimental noise. We use ENCODE data on chromatin state to show that ultralow-expression genes are predominantly associated with repressed chromatin; we provide a novel normalization metric, zFPKM, that identifies the threshold between active and background gene expression; and we show that this threshold is robust to experimental and analytical variations.

CONCLUSIONS: The zFPKM normalization method accurately separates the biologically relevant genes in a cell, which are associated with active promoters, from the ultralow-expression noisy genes that have repressed promoters. A read depth of twenty to thirty million mapped reads allows high-confidence quantitation of genes expressed at this threshold, providing important guidance for the design of RNA-seq studies of gene expression. Moreover, we offer an example for using extensive ENCODE chromatin state information to validate RNA-seq analysis pipelines.},
  author = {Hart, Traver and Komori, H Kiyomi and LaMere, Sarah and Podshivalova, Katie and Salomon, Daniel R},
  date = {2013-01},
  doi = {10/gb3gbw},
  eprint = {24215113},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Hart et al. - 2013 - Finding the active genes in deep RNA-seq gene expr.pdf},
  issn = {1471-2164},
  journaltitle = {BMC Genomics},
  pages = {778},
  title = {Finding the Active Genes in Deep {{RNA}}-Seq Gene Expression Studies},
  volume = {14}
}

@article{Hashimoto2014,
  author = {Hashimoto, Tatsunori B. and Edwards, Matthew D. and Gifford, David K.},
  date = {2014-03-06},
  doi = {10/ggcxkg},
  file = {/Users/ryan/Documents/Zotero Library/Hashimoto et al. - 2014 - Universal Count Correction for High-Throughput Seq.pdf},
  issn = {1553-7358},
  journaltitle = {PLoS Computational Biology},
  number = {3},
  pages = {e1003494},
  title = {Universal {{Count Correction}} for {{High}}-{{Throughput Sequencing}}},
  volume = {10}
}

@article{Hawkins2013,
  abstract = {Naive CD4{$^+$} T cells can differentiate into specific helper and regulatory T cell lineages in order to combat infection and disease. The correct response to cytokines and a controlled balance of these populations is critical for the immune system and the avoidance of autoimmune disorders. To investigate how early cell-fate commitment is regulated, we generated the first human genome-wide maps of histone modifications that reveal enhancer elements after 72 hr of in vitro polarization toward T helper 1 (Th1) and T helper 2 (Th2) cell lineages. Our analysis indicated that even at this very early time point, cell-specific gene regulation and enhancers were at work directing lineage commitment. Further examination of lineage-specific enhancers identified transcription factors (TFs) with known and unknown T cell roles as putative drivers of lineage-specific gene expression. Lastly, an integrative analysis of immunopathogenic-associated SNPs suggests a role for distal regulatory elements in disease etiology.},
  author = {Hawkins, R David and Larjo, Antti and Tripathi, Subhash K and Wagner, Ulrich and Luu, Ying and L\"onnberg, Tapio and Raghav, Sunil K and Lee, Leonard K and Lund, Riikka and Ren, Bing and L\"ahdesm\"aki, Harri and Lahesmaa, Riitta},
  date = {2013-06-27},
  doi = {10/f49h3b},
  eprint = {23791644},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Hawkins et al. - 2013 - Global chromatin state analysis reveals lineage-sp.pdf},
  issn = {1097-4180},
  journaltitle = {Immunity},
  keywords = {Cell Differentiation,Cell Differentiation: genetics,Cell Lineage,Cell Lineage: genetics,Chromatin,Chromatin: metabolism,Genetic Predisposition to Disease,Genome-Wide Association Study,Histones,Histones: genetics,Histones: metabolism,Humans,Immune System Diseases,Immune System Diseases: genetics,Immune System Diseases: immunology,Polymorphism; Single Nucleotide,Promoter Regions; Genetic,Th1 Cells,Th1 Cells: immunology,Th1-Th2 Balance,Th2 Cells,Th2 Cells: immunology},
  number = {6},
  pages = {1271-1284},
  title = {Global Chromatin State Analysis Reveals Lineage-Specific Enhancers during the Initiation of Human {{T}} Helper 1 and {{T}} Helper 2 Cell Polarization},
  volume = {38}
}

@article{Head2014,
  abstract = {High-throughput sequencing, also known as next-generation sequencing (NGS), has revolutionized genomic research. In recent years, NGS technology has steadily improved, with costs dropping and the number and range of sequencing applications increasing exponentially. Here, we examine the critical role of sequencing library quality and consider important challenges when preparing NGS libraries from DNA and RNA sources. Factors such as the quantity and physical characteristics of the RNA or DNA source material as well as the desired application (i.e., genome sequencing, targeted sequencing, RNA-seq, ChIP-seq, RIP-seq, and methylation) are addressed in the context of preparing high quality sequencing libraries. In addition, the current methods for preparing NGS libraries from single cells are also discussed.},
  author = {Head, Steven R and Komori, H Kiyomi and LaMere, Sarah A and Whisenant, Thomas and Van Nieuwerburgh, Filip and Salomon, Daniel R and Ordoukhanian, Phillip},
  date = {2014-01},
  doi = {10/gfpqg7},
  eprint = {24502796},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Head et al. - 2014 - Library construction for next-generation sequencin.pdf},
  issn = {1940-9818},
  journaltitle = {BioTechniques},
  keywords = {chip-seq,deep sequencing,dna,dna-seq,library preparation,next-generation sequencing,rip-seq,rna,rna-seq},
  number = {2},
  pages = {61-77},
  title = {Library Construction for Next-Generation Sequencing: {{Overviews}} and Challenges},
  volume = {56}
}

@article{Heerboth2014,
  abstract = {Epigenetic changes such as DNA methylation and histone methylation and acetylation alter gene expression at the level of transcription by upregulating, downregulating, or silencing genes completely. Dysregulation of epigenetic events can be pathological, leading to cardiovascular disease, neurological disorders, metabolic disorders, and cancer development. Therefore, identifying drugs that inhibit these epigenetic changes are of great clinical interest. In this review, we summarize the epigenetic events associated with different disorders and diseases including cardiovascular, neurological, and metabolic disorders, and cancer. Knowledge of the specific epigenetic changes associated with these types of diseases facilitates the development of specific inhibitors, which can be used as epigenetic drugs. In this review, we discuss the major classes of epigenetic drugs currently in use, such as DNA methylation inhibiting drugs, bromodomain inhibitors, histone acetyl transferase inhibitors, histone deacetylase inhibitors, protein methyltransferase inhibitors, and histone methylation inhibitors and their role in reversing epigenetic changes and treating disease. \textcopyright{} The authors, publisher and licensee Libertas Academica Limited.},
  author = {Heerboth, Sarah and Lapinska, Karolina and Snyder, Nicole and Leary, Meghan and Rollinson, Sarah and Sarkar, Sibaji},
  date = {2014-01-27},
  doi = {10/ggcxkj},
  file = {/Users/ryan/Documents/Zotero Library/Heerboth et al. - 2014 - Use of Epigenetic Drugs in Disease An Overview.pdf},
  issn = {1179-237X},
  journaltitle = {Genetics \& Epigenetics},
  keywords = {Cancer,Cardiovascular,DNA methylation,Drugs,Epigenetics,Gene expression,Gene silencing,Histone acetylation,Histone methylation,Metabolic,Neurological,Tumor suppressor genes},
  pages = {GEG.S12270},
  title = {Use of {{Epigenetic Drugs}} in {{Disease}}: {{An Overview}}},
  volume = {6}
}

@article{Hesterberg2014,
  abstract = {I have three goals in this article: (1) To show the enormous potential of bootstrapping and permutation tests to help students understand statistical concepts including sampling distributions, standard errors, bias, confidence intervals, null distributions, and P-values. (2) To dig deeper, understand why these methods work and when they don't, things to watch out for, and how to deal with these issues when teaching. (3) To change statistical practice---by comparing these methods to common t tests and intervals, we see how inaccurate the latter are; we confirm this with asymptotics. n {$>$}= 30 isn't enough---think n {$>$}= 5000. Resampling provides diagnostics, and more accurate alternatives. Sadly, the common bootstrap percentile interval badly under-covers in small samples; there are better alternatives. The tone is informal, with a few stories and jokes.},
  author = {Hesterberg, Tim},
  date = {2014},
  doi = {10/gd85v5},
  eprint = {1411.5279},
  eprinttype = {arxiv},
  file = {/Users/ryan/Documents/Zotero Library/Hesterberg - 2014 - What Teachers Should Know about the Bootstrap Res.pdf},
  issn = {0003-1305},
  journaltitle = {arXiv preprint},
  keywords = {★,bootstrap,permutation test,randomization test,teaching},
  pages = {83},
  title = {What {{Teachers Should Know}} about the {{Bootstrap}}: {{Resampling}} in the {{Undergraduate Statistics Curriculum}}}
}

@article{Hicks2016,
  author = {Hicks, Stephanie C and Okrah, Kwame and Paulson, Joseph N and Quackenbush, John and Irizarry, Rafael A and Corrada Bravo, Hector},
  date = {2016},
  doi = {10/ggcxkk},
  file = {/Users/ryan/Documents/Zotero Library/Hicks et al. - 2016 - Smooth Quantile Normalization.pdf},
  journaltitle = {bioRxiv},
  pubstate = {prepublished},
  title = {Smooth {{Quantile Normalization}}}
}

@article{Hoang2011,
  abstract = {The advent of ChIP-seq technology has made the investigation of epigenetic regulatory networks a computationally tractable problem. Several groups have applied statistical computing methods to ChIP-seq datasets to gain insight into the epigenetic regulation of transcription. However, methods for estimating enrichment levels in ChIP-seq data for these computational studies are understudied and variable. Since the conclusions drawn from these data mining and machine learning applications strongly depend on the enrichment level inputs, a comparison of estimation methods with respect to the performance of statistical models should be made.},
  author = {Hoang, Stephen A and Xu, Xiaojiang and Bekiranov, Stefan},
  date = {2011-01},
  doi = {10/d6d96w},
  eprint = {21834981},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Hoang et al. - 2011 - Quantification of histone modification ChIP-seq en.pdf},
  issn = {1756-0500},
  journaltitle = {BMC Research Notes},
  number = {1},
  pages = {288},
  title = {Quantification of Histone Modification {{ChIP}}-Seq Enrichment for Data Mining and Machine Learning Applications},
  volume = {4}
}

@article{Hoffman2009,
  abstract = {The transcriptional networks underlying mammalian cell development and function are largely unknown. The recently described use of flow cell sequencing devices in combination with chromatin immunoprecipitation (ChIP-seq) stands to revolutionize the identification of DNA-protein interactions. As such, ChIP-seq is rapidly becoming the method of choice for the genome-wide localization of histone modifications and transcription factor binding sites. As further studies are performed, the information generated by ChIP-seq is expected to allow the development of a framework for networks describing the transcriptional regulation of cellular development and function. However, to date, this technology has been applied only to a small number of cell types, and even fewer tissues, suggesting a huge potential for novel discovery in this field.},
  author = {Hoffman, Brad G and Jones, Steven J M},
  date = {2009-04},
  doi = {10/d2tc7z},
  eprint = {19136617},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Hoffman and Jones - 2009 - Genome-wide identification of DNA-protein interact.pdf},
  issn = {1479-6805},
  journaltitle = {The Journal of Endocrinology},
  keywords = {Animals,Binding Sites,Binding Sites: genetics,Chromatin Immunoprecipitation,Chromatin Immunoprecipitation: methods,Chromosome Mapping,Chromosome Mapping: methods,DNA-Binding Proteins,DNA-Binding Proteins: metabolism,Humans,Models; Biological,Protein Binding,Sequence Analysis; DNA,Sequence Analysis; DNA: methods},
  number = {1},
  pages = {1-13},
  title = {Genome-Wide Identification of {{DNA}}-Protein Interactions Using Chromatin Immunoprecipitation Coupled with Flow Cell Sequencing},
  volume = {201}
}

@article{Hoffman2016,
  abstract = {Gene expression datasets are complicated and have multiple sources of biological and technical variation. These datasets have recently become more complex as it is now feasible to assay gene expression from the same individual in multiple tissues or at multiple time points. The variancePartition package implements a statistical method to quantify the contribution of multiple sources of variation and decouple within/between-individual variation. In addition, variancePartition produces results at the gene-level to identify genes that follow or deviate from the genome-wide trend.},
  author = {Hoffman, Gabriel},
  date = {2016},
  doi = {10/ggcxkh},
  file = {/Users/ryan/Documents/Zotero Library/Hoffman - 2016 - variancePartition Quantifying and interpreting dr.pdf},
  issn = {1471-2105},
  journaltitle = {BMC Bioinformatics},
  keywords = {linear mixed model,rna-seq,transcriptome profiling},
  pages = {10-12},
  title = {{{variancePartition}}: {{Quantifying}} and Interpreting Drivers of Variation in Multilevel Gene Expression Experiments}
}

@article{Holik2016,
  abstract = {Carefully designed control experiments provide a gold standard for benchmarking different genomics research tools. A shortcoming of many gene expression control studies is that replication involves profiling the same reference RNA sample multiple times. This leads to low, pure technical noise that is atypical of regular studies. To achieve a more realistic noise structure, we generated a RNA-sequencing mixture experiment using two cell lines of the same cancer type. Variability was added by extracting RNA from independent cell cultures and degrading particular samples. The systematic gene expression changes induced by this design allowed benchmarking of different library preparation kits (standard poly-A versus total RNA with Ribozero depletion) and analysis pipelines. Data generated using the total RNA kit had more signal for introns and various RNA classes (ncRNA, snRNA, snoRNA) and less variability after degradation. For differential expression analysis, voom with quality weights marginally outperformed other popular methods, while for differential splicing, DEXSeq was simultaneously the most sensitive and the most inconsistent method. For sample deconvolution analysis, DeMix outperformed IsoPure convincingly. Our RNA-sequencing data set provides a valuable resource for benchmarking different protocols and data pre-processing workflows. The extra noise mimics routine lab experiments more closely, ensuring any conclusions are widely applicable.},
  author = {Holik, Aliaksei Z. and Law, Charity W. and Liu, Ruijie and Wang, Zeya and Wang, Wenyi and Ahn, Jaeil and Asselin-Labat, Marie-Liesse and Smyth, Gordon K. and Ritchie, Matthew E.},
  date = {2017-03-17},
  doi = {10/f9x5nx},
  eprint = {27899618},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Holik et al. - 2017 - RNA-seq mixology designing realistic control expe.pdf},
  isbn = {2076792171},
  issn = {0305-1048},
  journaltitle = {Nucleic Acids Research},
  keywords = {★},
  number = {5},
  pages = {e30},
  title = {{{RNA}}-Seq Mixology: Designing Realistic Control Experiments to Compare Protocols and Analysis Methods},
  volume = {45}
}

@article{Holt2011,
  abstract = {Second-generation sequencing technologies are precipitating major shifts with regards to what kinds of genomes are being sequenced and how they are annotated. While the first generation of genome projects focused on well-studied model organisms, many of today's projects involve exotic organisms whose genomes are largely terra incognita. This complicates their annotation, because unlike first-generation projects, there are no pre-existing 'gold-standard' gene-models with which to train gene-finders. Improvements in genome assembly and the wide availability of mRNA-seq data are also creating opportunities to update and re-annotate previously published genome annotations. Today's genome projects are thus in need of new genome annotation tools that can meet the challenges and opportunities presented by second-generation sequencing technologies.},
  author = {Holt, Carson and Yandell, Mark},
  date = {2011-01},
  doi = {10/fz39nj},
  eprint = {22192575},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Holt and Yandell - 2011 - MAKER2 an annotation pipeline and genome-database.pdf},
  issn = {1471-2105},
  journaltitle = {BMC Bioinformatics},
  keywords = {Animals,Databases; Genetic,Genome,High-Throughput Nucleotide Sequencing,High-Throughput Nucleotide Sequencing: methods,Humans,Molecular Sequence Annotation,Plants,Plants: genetics,Software},
  number = {1},
  pages = {491},
  title = {{{MAKER2}}: An Annotation Pipeline and Genome-Database Management Tool for Second-Generation Genome Projects},
  volume = {12}
}

@article{Houseman2012,
  abstract = {BACKGROUND: There has been a long-standing need in biomedical research for a method that quantifies the normally mixed composition of leukocytes beyond what is possible by simple histological or flow cytometric assessments. The latter is restricted by the labile nature of protein epitopes, requirements for cell processing, and timely cell analysis. In a diverse array of diseases and following numerous immune-toxic exposures, leukocyte composition will critically inform the underlying immuno-biology to most chronic medical conditions. Emerging research demonstrates that DNA methylation is responsible for cellular differentiation, and when measured in whole peripheral blood, serves to distinguish cancer cases from controls.

RESULTS: Here we present a method, similar to regression calibration, for inferring changes in the distribution of white blood cells between different subpopulations (e.g. cases and controls) using DNA methylation signatures, in combination with a previously obtained external validation set consisting of signatures from purified leukocyte samples. We validate the fundamental idea in a cell mixture reconstruction experiment, then demonstrate our method on DNA methylation data sets from several studies, including data from a Head and Neck Squamous Cell Carcinoma (HNSCC) study and an ovarian cancer study. Our method produces results consistent with prior biological findings, thereby validating the approach.

CONCLUSIONS: Our method, in combination with an appropriate external validation set, promises new opportunities for large-scale immunological studies of both disease states and noxious exposures.},
  author = {Houseman, Eugene Andres and Accomando, William P and Koestler, Devin C and Christensen, Brock C and Marsit, Carmen J and Nelson, Heather H and Wiencke, John K and Kelsey, Karl T},
  date = {2012},
  doi = {10/gb8vmz},
  eprint = {22568884},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Houseman et al. - 2012 - DNA methylation arrays as surrogate measures of ce.pdf},
  issn = {1471-2105},
  journaltitle = {BMC Bioinformatics},
  keywords = {Computer Simulation,Data Interpretation; Statistical,DNA Methylation,Down Syndrome,Down Syndrome: blood,Down Syndrome: diagnosis,Down Syndrome: immunology,Epigenesis; Genetic,Female,Gene Expression Profiling,Head and Neck Neoplasms,Head and Neck Neoplasms: blood,Head and Neck Neoplasms: diagnosis,Head and Neck Neoplasms: immunology,Humans,Leukocyte Count,Leukocyte Count: methods,Leukocytes,Leukocytes: immunology,Obesity,Obesity: blood,Obesity: genetics,Obesity: immunology,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: statistic,Ovarian Neoplasms,Ovarian Neoplasms: blood,Ovarian Neoplasms: diagnosis,Ovarian Neoplasms: immunology},
  number = {1},
  pages = {86},
  title = {{{DNA}} Methylation Arrays as Surrogate Measures of Cell Mixture Distribution},
  volume = {13}
}

@article{Houseman2014b,
  abstract = {MOTIVATION: Recently there has been increasing interest in the effects of cell mixture on the measurement of DNA methylation, specifically the extent to which small perturbations in cell mixture proportions can register as changes in DNA methylation. A recently published set of statistical methods exploits this association to infer changes in cell mixture proportions, and these methods are presently being applied to adjust for cell mixture effect in the context of epigenome-wide association studies. However, these adjustments require the existence of reference datasets, which may be laborious or expensive to collect. For some tissues such as placenta, saliva, adipose or tumor tissue, the relevant underlying cell types may not be known.

RESULTS: We propose a method for conducting epigenome-wide association studies analysis when a reference dataset is unavailable, including a bootstrap method for estimating standard errors. We demonstrate via simulation study and several real data analyses that our proposed method can perform as well as or better than methods that make explicit use of reference datasets. In particular, it may adjust for detailed cell type differences that may be unavailable even in existing reference datasets.

AVAILABILITY AND IMPLEMENTATION: Software is available in the R package RefFreeEWAS. Data for three of four examples were obtained from Gene Expression Omnibus (GEO), accession numbers GSE37008, GSE42861 and GSE30601, while reference data were obtained from GEO accession number GSE39981.

CONTACT: andres.houseman@oregonstate.edu

SUPPLEMENTARY INFORMATION: Supplementary data are available at Bioinformatics online.},
  author = {Houseman, Eugene Andres and Molitor, John and Marsit, Carmen J.},
  date = {2014-05-15},
  doi = {10/f5488h},
  eprint = {24451622},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Houseman et al. - 2014 - Reference-free cell mixture adjustments in analysi.pdf;/Users/ryan/Documents/Zotero Library/Houseman et al. - 2014 - Reference-free cell mixture adjustments in analysi2.pdf},
  issn = {1460-2059},
  journaltitle = {Bioinformatics},
  number = {10},
  pages = {1431-1439},
  title = {Reference-Free Cell Mixture Adjustments in Analysis of {{DNA}} Methylation Data},
  volume = {30}
}

% Review fix: the previous abstract (rock-mechanics dilatancy) and shortDOI did not belong to this
% chapter; the abstract was dropped and the DOI now matches the one embedded in the url field.
@incollection{Houseman2015,
  author = {Houseman, E. Andr\'es},
  booktitle = {Computational and {{Statistical Epigenomics}}},
  date = {2015},
  doi = {10.1007/978-94-017-9927-0_2},
  editor = {Teschendorff, Andrew E.},
  isbn = {978-94-017-9926-3},
  keywords = {\#nosource,Cell composition,Confounding,DMP,DMR,Immune,Mediation},
  pages = {35-50},
  publisher = {{Springer Netherlands}},
  title = {{{DNA Methylation}} and {{Cell}}-{{Type Distribution}}},
  url = {http://link.springer.com/10.1007/978-94-017-9927-0_2},
  volume = {7}
}

% NOTE(review): Houseman2015a is typed as an article but carries no journaltitle, pages or abstract.
% Its title, ISBN and volume equal the booktitle, ISBN and volume of Houseman2015, so it looks like
% a stub or duplicate import of that chapter's parent book - confirm, then merge or retype.
@article{Houseman2015a,
  author = {Houseman, E Andr\'es},
  date = {2015},
  doi = {10/ggcxkm},
  isbn = {978-94-017-9926-3},
  keywords = {\#nosource,cell composition,confounding,dmp,dmr,immune},
  title = {Computational and {{Statistical Epigenomics}}},
  volume = {7}
}

@article{Hu2013,
  abstract = {The RNA transcriptome varies in response to cellular differentiation as well as environmental factors, and can be characterized by the diversity and abundance of transcript isoforms. Differential transcription analysis, the detection of differences between the transcriptomes of different cells, may improve understanding of cell differentiation and development and enable the identification of biomarkers that classify disease types. The availability of high-throughput short-read RNA sequencing technologies provides in-depth sampling of the transcriptome, making it possible to accurately detect the differences between transcriptomes. In this article, we present a new method for the detection and visualization of differential transcription. Our approach does not depend on transcript or gene annotations. It also circumvents the need for full transcript inference and quantification, which is a challenging problem because of short read lengths, as well as various sampling biases. Instead, our method takes a divide-and-conquer approach to localize the difference between transcriptomes in the form of alternative splicing modules (ASMs), where transcript isoforms diverge. Our approach starts with the identification of ASMs from the splice graph, constructed directly from the exons and introns predicted from RNA-seq read alignments. The abundance of alternative splicing isoforms residing in each ASM is estimated for each sample and is compared across sample groups. A non-parametric statistical test is applied to each ASM to detect significant differential transcription with a controlled false discovery rate. The sensitivity and specificity of the method have been assessed using simulated data sets and compared with other state-of-the-art approaches.
Experimental validation using qRT-PCR confirmed a selected set of genes that are differentially expressed in a lung differentiation study and a breast cancer data set, demonstrating the utility of the approach applied on experimental biological data sets. The software of DiffSplice is available at http://www.netlab.uky.edu/p/bioinfo/DiffSplice.},
  author = {Hu, Yin and Huang, Yan and Du, Ying and Orellana, Christian F and Singh, Darshan and Johnson, Amy R and Monroy, Ana\"is and Kuan, Pei-Fen and Hammond, Scott M and Makowski, Liza and Randell, Scott H and Chiang, Derek Y and Hayes, D Neil and Jones, Corbin and Liu, Yufeng and Prins, Jan F and Liu, Jinze},
  date = {2013-01-01},
  doi = {10/f4ms57},
  eprint = {23155066},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Hu et al. - 2013 - DiffSplice the genome-wide detection of different.pdf},
  issn = {1362-4962},
  journaltitle = {Nucleic Acids Research},
  number = {2},
  pages = {e39},
  title = {{{DiffSplice}}: The Genome-Wide Detection of Differential Splicing Events with {{RNA}}-Seq},
  volume = {41}
}

@article{Huang2012,
  abstract = {The circadian clock in mammals is driven by an autoregulatory transcriptional feedback mechanism that takes approximately 24 hours to complete. A key component of this mechanism is a heterodimeric transcriptional activator consisting of two basic helix-loop-helix PER-ARNT-SIM (bHLH-PAS) domain protein subunits, CLOCK and BMAL1. Here, we report the crystal structure of a complex containing the mouse CLOCK:BMAL1 bHLH-PAS domains at 2.3 \AA{} resolution. The structure reveals an unusual asymmetric heterodimer with the three domains in each of the two subunits--bHLH, PAS-A, and PAS-B--tightly intertwined and involved in dimerization interactions, resulting in three distinct protein interfaces. Mutations that perturb the observed heterodimer interfaces affect the stability and activity of the CLOCK:BMAL1 complex as well as the periodicity of the circadian oscillator. The structure of the CLOCK:BMAL1 complex is a starting point for understanding at an atomic level the mechanism driving the mammalian circadian clock.},
  author = {Huang, Nian and Chelliah, Yogarany and Shan, Yongli and Taylor, Clinton A. and Yoo, Seung-Hee and Partch, Carrie and Green, Carla B and Zhang, Hong and Takahashi, Joseph S},
  date = {2012-07-13},
  doi = {10/h3m},
  eprint = {22653727},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Huang et al. - 2012 - Crystal structure of the heterodimeric CLOCKBMAL1.pdf},
  issn = {1095-9203},
  journaltitle = {Science},
  keywords = {Amino Acid Sequence,Animals,ARNTL Transcription Factors,ARNTL Transcription Factors: chemistry,ARNTL Transcription Factors: genetics,ARNTL Transcription Factors: metabolism,Cells; Cultured,Circadian Rhythm,CLOCK Proteins,CLOCK Proteins: chemistry,CLOCK Proteins: genetics,CLOCK Proteins: metabolism,Crystallography; X-Ray,DNA,DNA: metabolism,HEK293 Cells,Helix-Loop-Helix Motifs,Humans,Mice,Models; Molecular,Molecular Sequence Data,Mutant Proteins,Mutant Proteins: chemistry,Mutant Proteins: metabolism,Protein Binding,Protein Interaction Domains and Motifs,Protein Multimerization,Protein Structure; Quaternary,Protein Structure; Secondary,Protein Structure; Tertiary,Protein Subunits,Protein Subunits: chemistry,Protein Subunits: metabolism,Static Electricity,Transcriptional Activation},
  number = {6091},
  pages = {189-194},
  title = {Crystal Structure of the Heterodimeric {{CLOCK}}:{{BMAL1}} Transcriptional Activator Complex},
  volume = {337}
}

@article{Huber2002,
  author = {Huber, Wolfgang and von Heydebreck, Anja and S\"ultmann, Holger and Poustka, Annemarie and Vingron, Martin},
  date = {2002-07-01},
  doi = {10/dbb6xx},
  file = {/Users/ryan/Documents/Zotero Library/Huber et al. - 2002 - Variance stabilization applied to microarray data .pdf},
  issn = {1367-4803},
  issue = {Suppl 1},
  journaltitle = {Bioinformatics},
  options = {useprefix=true},
  pages = {S96-S104},
  title = {Variance Stabilization Applied to Microarray Data Calibration and to the Quantification of Differential Expression},
  volume = {18}
}

@article{Huber2015,
  author = {Huber, Wolfgang and Carey, Vincent J and Gentleman, Robert and Anders, Simon and Carlson, Marc and Carvalho, Benilton S and Bravo, Hector Corrada and Davis, Sean and Gatto, Laurent and Girke, Thomas and Gottardo, Raphael and Hahne, Florian and Hansen, Kasper D and Irizarry, Rafael A. and Lawrence, Michael and Love, Michael I and Macdonald, James and Obenchain, Valerie and Ole\'s, Andrzej K and Pag\`es, Herv\'e and Reyes, Alejandro and Shannon, Paul and Smyth, Gordon K and Tenenbaum, Dan and Waldron, Levi and Morgan, Martin},
  date = {2015},
  doi = {10/bb35},
  file = {/Users/ryan/Documents/Zotero Library/Huber et al. - 2015 - Orchestrating high-throughput genomic analysis wit.pdf},
  issn = {1548-7091},
  journaltitle = {Nature Methods},
  number = {2},
  pages = {115-121},
  title = {Orchestrating High-Throughput Genomic Analysis with {{Bioconductor}}},
  volume = {12}
}
@article{Hull2013,
  abstract = {BACKGROUND: The oxidative burst is one of the major antimicrobial mechanisms adopted by macrophages. The WKY rat strain is uniquely susceptible to experimentally induced macrophage-dependent crescentic glomerulonephritis (Crgn). We previously identified the AP-1 transcription factor JunD as a determinant of macrophage activation in WKY bone marrow-derived macrophages (BMDMs). JunD is over-expressed in WKY BMDMs and its silencing reduces Fc receptor-mediated oxidative burst in these cells.

RESULTS: Here we combined Jund RNA interference with microarray analyses alongside ChIP-sequencing (ChIP-Seq) analyses in WKY BMDMs to investigate JunD-mediated control of macrophage activation in basal and lipopolysaccharide (LPS) stimulated cells. Microarray analysis following Jund silencing showed that Jund activates and represses gene expression with marked differential expression ({$>$}3 fold) for genes linked with oxidative stress and IL-1{$\beta$} expression. These results were complemented by comparing whole genome expression in WKY BMDMs with Jund congenic strain (WKY.LCrgn2) BMDMs which express lower levels of JunD. ChIP-Seq analyses demonstrated that the increased expression of JunD resulted in an increased number of binding events in WKY BMDMs compared to WKY.LCrgn2 BMDMs. Combined ChIP-Seq and microarray analysis revealed a set of primary JunD-targets through which JunD exerts its effect on oxidative stress and IL-1{$\beta$} synthesis in basal and LPS-stimulated macrophages.

CONCLUSIONS: These findings demonstrate how genetically determined levels of a transcription factor affect its binding sites in primary cells and identify JunD as a key regulator of oxidative stress and IL-1{$\beta$} synthesis in primary macrophages, which may play a role in susceptibility to Crgn.},
  author = {Hull, Richard P. and Srivastava, Prashant K. and D'Souza, Zelpha and Atanur, Santosh S. and Mechta-Grigoriou, Fatima and Game, Laurence and Petretto, Enrico and Cook, H. Terence and Aitman, Timothy J. and Behmoaras, Jacques},
  date = {2013},
  doi = {10/gb3f26},
  eprint = {23398888},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Hull et al. - 2013 - Combined ChIP-Seq and transcriptome analysis ident.pdf},
  issn = {1471-2164},
  journaltitle = {BMC Genomics},
  number = {1},
  title = {Combined {{ChIP}}-{{Seq}} and Transcriptome Analysis Identifies {{AP}}-1/{{JunD}} as a Primary Regulator of Oxidative Stress and {{IL}}-1{$\beta$} Synthesis in Macrophages},
  volume = {14}
}

@article{Hummel2008,
  abstract = {MOTIVATION: Several authors have studied expression in gene sets with specific goals: overrepresentation of interesting genes in functional groups, predictive power for class membership and searches for groups where the constituent genes show coordinated changes in expression under the experimental conditions. The purpose of this article is to follow the third direction. One important aspect is that the gene sets under analysis are known a priori and are not determined from the experimental data at hand. Our goal is to provide a methodology that helps to identify the relevant structural constituents (phenotypical, experimental design, biological component) that determine gene expression in a group.

RESULTS: Gene-wise linear models are used to formalize the structural aspects of a study. The full model is contrasted with a reduced model that lacks the relevant design component. A comparison with respect to goodness of fit is made and quantified. An asymptotic test and a permutation test are derived to test the null hypothesis that the reduced model sufficiently explains the observed expression within the gene group of interest. Graphical tools are available to illustrate and interpret the results of the analysis. Examples demonstrate the wide range of application.

AVAILABILITY: The R-package GlobalAncova (http://www.bioconductor.org) offers data and functions as well as a vignette to guide the user through specific analysis steps.},
  author = {Hummel, Manuela and Meister, Reinhard and Mansmann, Ulrich},
  date = {2008-01-01},
  doi = {10/fn3tkw},
  eprint = {18024976},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Hummel et al. - 2008 - GlobalANCOVA exploration and assessment of gene g.pdf},
  issn = {1367-4811},
  journaltitle = {Bioinformatics},
  keywords = {Algorithms,Computer Simulation,Gene Expression Profiling,Gene Expression Profiling: methods,Models; Biological,Multigene Family,Multigene Family: physiology,Proteome,Proteome: metabolism,Signal Transduction,Signal Transduction: physiology,Software,User-Computer Interface},
  number = {1},
  pages = {78-85},
  title = {{{GlobalANCOVA}}: Exploration and Assessment of Gene Group Effects},
  volume = {24}
}

@misc{Humphreys2011,
  author = {Humphreys, David T and Muthiah, Kavitha and Thomas, Liza and Macdonald, Peter and Hayward, Chris},
  date = {2011},
  file = {/Users/ryan/Documents/Zotero Library/Humphreys et al. - 2011 - Assessment of cardiac microRNA high throughput seq.pdf},
  title = {Assessment of Cardiac {{microRNA}} High Throughput Sequencing Data Sets Generated from {{RNA}} of Varying Quality},
  url = {http://www.slideshare.net/AustralianBioinformatics/assessment-of-cardiac-mi-rna-hts-data-sets-david-t-humphreys}
}

@article{Hussey2017,
  abstract = {Despite the considerable contribution of xylem development (xylogenesis) to plant biomass accumulation, its epigenetic regulation is poorly understood. Furthermore, the relative contributions of histone modifications to transcriptional regulation is not well studied in plants. We investigated the biological relevance of H3K4me3 and H3K27me3 in secondary xylem development using ChIP-seq and their association with transcript levels among other histone modifications in woody and herbaceous models. In developing secondary xylem of the woody model Eucalyptus grandis, H3K4me3 and H3K27me3 genomic spans were distinctly associated with xylogenesis-related processes, with (late) lignification pathways enriched for putative bivalent domains, but not early secondary cell wall polysaccharide deposition. H3K27me3-occupied genes, of which 753 (\textasciitilde{}31\%) are novel targets, were enriched for transcriptional regulation and flower development and had significant preferential expression in roots. Linear regression models of the ChIP-seq profiles predicted \textasciitilde{}50\% of transcript abundance measured with strand-specific RNA-seq, confirmed in a parallel analysis in Arabidopsis where integration of seven additional histone modifications each contributed smaller proportions of unique information to the predictive models. This study uncovers the biological importance of histone modification antagonism and genomic span in xylogenesis and quantifies for the first time the relative correlations of histone modifications with transcript abundance in plants.},
  author = {Hussey, Steven G. and Loots, Mattheus T. and Van Der Merwe, Karen and Mizrachi, Eshchar and Myburg, Alexander A.},
  date = {2017},
  doi = {10/gbh55d},
  eprint = {28611454},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Hussey et al. - 2017 - Integrated analysis and transcript abundance model.pdf},
  issn = {2045-2322},
  journaltitle = {Scientific Reports},
  number = {1},
  pages = {1-14},
  title = {Integrated Analysis and Transcript Abundance Modelling of {{H3K4me3}} and {{H3K27me3}} in Developing Secondary Xylem},
  volume = {7}
}

% NOTE(review): Ignatiadis2015 has no journaltitle, volume or pages. Authors and the first part of
% the title match Ignatiadis2016, so this is presumably the preprint of that paper - confirm and
% add the missing venue or eprint fields.
@article{Ignatiadis2015,
  author = {Ignatiadis, Nikolaos and Klaus, Bernd and Zaugg, Judith and Huber, Wolfgang},
  date = {2015},
  doi = {10/ggcxkn},
  file = {/Users/ryan/Documents/Zotero Library/Ignatiadis et al. - 2015 - Data-driven hypothesis weighting increases detecti.pdf},
  title = {Data-Driven Hypothesis Weighting Increases Detection Power in Big Data Analytics}
}

@article{Ignatiadis2016,
  abstract = {Hypothesis weighting improves the power of large-scale multiple testing. We describe independent hypothesis weighting (IHW), a method that assigns weights using covariates independent of the P-values under the null hypothesis but informative of each test's power or prior probability of the null hypothesis (http://www.bioconductor.org/packages/IHW). IHW increases power while controlling the false discovery rate and is a practical approach to discovering associations in genomics, high-throughput biology and other large data sets.},
  author = {Ignatiadis, Nikolaos and Klaus, Bernd and Zaugg, Judith B. and Huber, Wolfgang},
  date = {2016-07-30},
  doi = {10/gf92hz},
  eprint = {27240256},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Ignatiadis et al. - 2016 - Data-driven hypothesis weighting increases detecti.pdf},
  issn = {1548-7091},
  journaltitle = {Nature Methods},
  number = {7},
  pages = {577-580},
  title = {Data-Driven Hypothesis Weighting Increases Detection Power in Genome-Scale Multiple Testing},
  volume = {13}
}

@online{Ignatiadis2017,
  abstract = {A fundamental task in the analysis of datasets with many variables is screening for associations. This can be cast as a multiple testing task, where the major challenge is achieving high detection power while controlling type I error. We consider $m$ hypothesis tests represented by pairs $((P_i, X_i))_{1\leq i \leq m}$ of p-values $P_i$ and covariates $X_i$, such that $P_i \perp X_i$ under the null hypothesis. Here, we show how to use information potentially available in the covariates about heterogeneities among hypotheses to increase power compared to conventional procedures that only use the $P_i$. To this end, we upgrade existing weighted multiple testing procedures through the Independent Hypothesis Weighting (IHW) framework to use data-driven weights which are a function of the covariate $X_i$. Finite sample guarantees, e.g. false discovery rate (FDR) control, are derived from cross-weighting, a novel data-splitting approach that enables learning the weight-covariate function without overfitting as long as the hypotheses can be partitioned into independent folds, with arbitrary within-fold dependence. We show how the increased power of IHW can be understood in terms of the conditional two-groups model. A key implication of IHW is that hypothesis rejection in many common multiple testing setups should not proceed according to the ranking of the p-values, but by an alternative ranking implied by the covariate-weighted p-values.},
  author = {Ignatiadis, Nikolaos and Huber, Wolfgang},
  date = {2017-01-18},
  eprint = {1701.05179},
  eprinttype = {arxiv},
  file = {/Users/ryan/Documents/Zotero Library/Ignatiadis and Huber - 2017 - Covariate powered cross-weighted multiple testing.pdf},
  title = {Covariate Powered Cross-Weighted Multiple Testing},
  url = {http://arxiv.org/abs/1701.05179}
}

% Ihaka and Gentleman (1996): journal article describing the design of the R language.
% The "ids" field registers ihakaLanguageDataAnalysis1996a as an alternate citation key for this record.
% NOTE(review): the "#nosource" keyword coincides with the absence of a "file" field here -
% presumably it marks records without an attached PDF; confirm.
@article{ihakaLanguageDataAnalysis1996,
  abstract = {In this article we discuss our experience designing and implementing a statistical computing language. In developing this new language, we sought to combine what we felt were useful features from two existing computer languages. We feel that the new language provides advantages in the areas of portability, computational efficiency, memory management, and scoping.},
  author = {Ihaka, Ross and Gentleman, Robert},
  date = {1996-09},
  doi = {10/fdscw4},
  ids = {ihakaLanguageDataAnalysis1996a},
  issn = {1061-8600},
  journaltitle = {Journal of Computational and Graphical Statistics},
  keywords = {\#nosource},
  number = {3},
  pages = {299-314},
  shorttitle = {R},
  title = {R: {{A Language}} for {{Data Analysis}} and {{Graphics}}},
  volume = {5}
}

@article{Ingolia2011,
  abstract = {The ability to sequence genomes has far outstripped approaches for deciphering the information they encode. Here we present a suite of techniques, based on ribosome profiling (the deep sequencing of ribosome-protected mRNA fragments), to provide genome-wide maps of protein synthesis as well as a pulse-chase strategy for determining rates of translation elongation. We exploit the propensity of harringtonine to cause ribosomes to accumulate at sites of translation initiation together with a machine learning algorithm to define protein products systematically. Analysis of translation in mouse embryonic stem cells reveals thousands of strong pause sites and unannotated translation products. These include amino-terminal extensions and truncations and upstream open reading frames with regulatory potential, initiated at both AUG and non-AUG codons, whose translation changes after differentiation. We also define a class of short, polycistronic ribosome-associated coding RNAs (sprcRNAs) that encode small proteins. Our studies reveal an unanticipated complexity to mammalian proteomes.},
  author = {Ingolia, Nicholas T and Lareau, Liana F and Weissman, Jonathan S},
  date = {2011-11-11},
  doi = {10/cw3v9k},
  eprint = {22056041},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Ingolia et al. - 2011 - Ribosome profiling of mouse embryonic stem cells r.pdf},
  issn = {1097-4172},
  journaltitle = {Cell},
  keywords = {Algorithms,Animals,Artificial Intelligence,Embryoid Bodies,Embryoid Bodies: cytology,Embryoid Bodies: metabolism,Embryonic Stem Cells,Embryonic Stem Cells: metabolism,Genomics,Genomics: methods,Harringtonines,Harringtonines: pharmacology,High-Throughput Nucleotide Sequencing,High-Throughput Nucleotide Sequencing: methods,Kinetics,Mice,Open Reading Frames,Peptide Chain Initiation; Translational,Protein Biosynthesis,Ribosomes,Ribosomes: chemistry,Ribosomes: drug effects,RNA,RNA: analysis,Sequence Analysis; RNA,Sequence Analysis; RNA: methods},
  number = {4},
  pages = {789-802},
  title = {Ribosome Profiling of Mouse Embryonic Stem Cells Reveals the Complexity and Dynamics of Mammalian Proteomes},
  volume = {147}
}

% NOTE(review): only one author is recorded for this article - confirm the full author list
% against the DOI before citing.
@article{Irizarry2003,
  author = {Irizarry, R. A.},
  date = {2003-02-15},
  doi = {10/cjwckj},
  file = {/Users/ryan/Documents/Zotero Library/Irizarry - 2003 - Summaries of Affymetrix GeneChip probe level data.pdf},
  issn = {1362-4962},
  journaltitle = {Nucleic Acids Research},
  number = {4},
  pages = {e15},
  title = {Summaries of {{Affymetrix GeneChip}} Probe Level Data},
  volume = {31}
}

@article{Irizarry2003a,
  abstract = {In this paper we report exploratory analyses of high-density oligonucleotide array data from the Affymetrix GeneChip system with the objective of improving upon currently used measures of gene expression. Our analyses make use of three data sets: a small experimental study consisting of five MGU74A mouse GeneChip arrays, part of the data from an extensive spike-in study conducted by Gene Logic and Wyeth's Genetics Institute involving 95 HG-U95A human GeneChip arrays; and part of a dilution study conducted by Gene Logic involving 75 HG-U95A GeneChip arrays. We display some familiar features of the perfect match and mismatch probe (PM and MM) values of these data, and examine the variance-mean relationship with probe-level data from probes believed to be defective, and so delivering noise only. We explain why we need to normalize the arrays to one another using probe level intensities. We then examine the behavior of the PM and MM using spike-in data and assess three commonly used summary measures: Affymetrix's (i) average difference (AvDiff) and (ii) MAS 5.0 signal, and (iii) the Li and Wong multiplicative model-based expression index (MBEI). The exploratory data analyses of the probe level data motivate a new summary measure that is a robust multi-array average (RMA) of background-adjusted, normalized, and log-transformed PM values. We evaluate the four expression summary measures using the dilution study data, assessing their behavior in terms of bias, variance and (for MBEI and RMA) model fit. Finally, we evaluate the algorithms in terms of their ability to detect known levels of differential expression using the spike-in data. We conclude that there is no obvious downside to using RMA and attaching a standard error (SE) to this quantity using a linear model which removes probe-specific affinities.},
  author = {Irizarry, Rafael A. and Hobbs, Bridget and Collin, Francois and Beazer-Barclay, Yasmin D and Antonellis, Kristen J and Scherf, Uwe and Speed, Terence P},
  date = {2003-04},
  doi = {10/b9zc5m},
  eprint = {12925520},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Irizarry et al. - 2003 - Exploration, normalization, and summaries of high .pdf},
  issn = {1465-4644},
  journaltitle = {Biostatistics},
  keywords = {Algorithms,Animals,Data Interpretation; Statistical,DNA Probes,DNA Probes: genetics,Gene Expression Profiling,Gene Expression Profiling: statistics & numerical,Humans,Linear Models,Mice,Normal Distribution,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods,Reproducibility of Results,Statistics; Nonparametric},
  number = {2},
  pages = {249-264},
  title = {Exploration, Normalization, and Summaries of High Density Oligonucleotide Array Probe Level Data},
  volume = {4}
}

% Israeli et al. (2007), Transplant Immunology 18(1):7-12; the record has no "file" field
% and is tagged with the "#nosource" keyword.
% NOTE(review): the title begins with "Preceeding" - check the publisher record before
% changing the spelling, since the published title may carry it.
@article{Israeli2007,
  abstract = {The survival of a transplanted organ is dependent on maintenance of continuous immunosuppression. However, even the strictest adherence to the recommended drug levels does not prevent the occurrence of numerous complications associated with immunosuppression. The efficacy of immunosuppression therapy protocols would be enhanced greatly by the availability of biotechnologies capable of identifying and predicting immunological events prior to the manifestation of clinical parameters indicating graft failure. The aim of the study was to evaluate the potential contribution of some modern tools for post-transplantation monitoring, and to propose a method for combining them into a comprehensive mechanism for this purpose. The technologies utilized in this study are among a group of `cutting edge' diagnostic methods at the initial steps of evaluation for their potential contribution for post-transplantation immune monitoring. This study was a pioneering opportunity to combine and utilize these tools jointly. The method of research was based on monitoring 13 adult kidney transplant recipients. The Immuknow assay determined cellular immunity status by quantitative measurement of intracellular ATP level in CD4+ lymphocytes after PHA stimulation. Sera were analyzed for concentration of soluble CD30 reflecting primary allo-stimulation and for donor specific anti-HLA antibodies responsible for accelerated and refractory rejection. The results were correlated with clinical and pathological parameters and appraisal of predictive value was attempted. In Immuknow assay analysis ATP incremental changes indicative of rejection or infection were found in 75\% and in 50\% incidences, respectively. In stable patients, the ATP deviation from the preoperative baseline, indicative of stable engraftment, was much less pronounced than in other habitual clinical tests. 
CD30 concentrations were measured greatly above normal values prior to biopsy-proven rejection episodes, both before and after the transplant operation. Anti-HLA antibodies were elevated at a later stage, concurrently with clinical manifestation of graft failure and rejection. Anti-HLA antibody level remained negligible in patients going through a stable post-transplant clinical course. Overall, the utilization of the platform of combined biotechnologies could serve as a valuable tool for immune monitoring in organ transplantation, allowing for therapeutic intervention that can favorably affect the clinical outcome.},
  author = {Israeli, Moshe and Yussim, Alex and Mor, Eitan and Sredni, Benjamin and Klein, Tirza},
  date = {2007-07-01},
  doi = {10/dtdsbs},
  issn = {0966-3274},
  journaltitle = {Transplant Immunology},
  keywords = {\#nosource},
  number = {1},
  pages = {7-12},
  title = {Preceeding the Rejection: {{In}} Search for a Comprehensive Post-Transplant Immune Monitoring Platform},
  volume = {18}
}

@online{Jacob2010a,
  abstract = {We consider multivariate two-sample tests of means, where the location shift between the two populations is expected to be related to a known graph structure. An important application of such tests is the detection of differentially expressed genes between two patient populations, as shifts in expression levels are expected to be coherent with the structure of graphs reflecting gene properties such as biological process, molecular function, regulation, or metabolism. For a fixed graph of interest, we demonstrate that accounting for graph structure can yield more powerful tests under the assumption of smooth distribution shift on the graph. We also investigate the identification of non-homogeneous subgraphs of a given large graph, which poses both computational and multiple testing problems. The relevance and benefits of the proposed approach are illustrated on synthetic data and on breast cancer gene expression data analyzed in context of KEGG pathways.},
  author = {Jacob, Laurent and Neuvial, Pierre and Dudoit, Sandrine},
  date = {2010-09-27},
  eprint = {1009.5173},
  eprinttype = {arxiv},
  file = {/Users/ryan/Documents/Zotero Library/Jacob et al. - 2010 - Gains in Power from Structured Two-Sample Tests of.pdf},
  pages = {1-24},
  title = {Gains in {{Power}} from {{Structured Two}}-{{Sample Tests}} of {{Means}} on {{Graphs}}},
  url = {http://arxiv.org/abs/1009.5173},
  urldate = {2014-03-21}
}

@article{Jaitin2014,
  author = {Jaitin, D. A. and Kenigsberg, E. and Keren-Shaul, H. and Elefant, N. and Paul, F. and Zaretsky, I. and Mildner, A. and Cohen, N. and Jung, S. and Tanay, A. and Amit, I.},
  date = {2014-02-13},
  doi = {10/f5rmjg},
  file = {/Users/ryan/Documents/Zotero Library/Jaitin et al. - 2014 - Massively Parallel Single-Cell RNA-Seq for Marker-.pdf;/Users/ryan/Documents/Zotero Library/Jaitin et al. - 2014 - Massively Parallel Single-Cell RNA-Seq for Marker-2.pdf},
  issn = {0036-8075},
  journaltitle = {Science},
  number = {6172},
  pages = {776-779},
  title = {Massively {{Parallel Single}}-{{Cell RNA}}-{{Seq}} for {{Marker}}-{{Free Decomposition}} of {{Tissues}} into {{Cell Types}}},
  volume = {343}
}

@book{James2013,
  abstract = {Statistical learning refers to a set of tools for modeling and understanding complex datasets. It is a recently developed area in statistics and blends with parallel developments in computer science and, in particular, machine learning. The field encompasses many methods such as the lasso and sparse regression, classification and regression trees, and boosting and support vector machines.},
  author = {James, Gareth and Witten, Daniela and Hastie, Trevor and Tibshirani, Robert},
  date = {2013},
  doi = {10/gcvjwx},
  edition = {6},
  file = {/Users/ryan/Documents/Zotero Library/James et al. - 2013 - An Introduction to Statistical Learning.pdf},
  isbn = {978-1-4614-7137-0},
  issn = {1431-875X},
  location = {{New York, NY}},
  number = {103},
  pagetotal = {426},
  publisher = {{Springer New York}},
  series = {Springer Texts in Statistics},
  title = {An Introduction to Statistical Learning},
  url = {http://link.springer.com/10.1007/978-1-4614-7138-7}
}

@article{Jeanmougin2010,
  abstract = {High-throughput post-genomic studies are now routinely and promisingly investigated in biological and biomedical research. The main statistical approach to select genes differentially expressed between two groups is to apply a t-test, which is subject of criticism in the literature. Numerous alternatives have been developed based on different and innovative variance modeling strategies. However, a critical issue is that selecting a different test usually leads to a different gene list. In this context and given the current tendency to apply the t-test, identifying the most efficient approach in practice remains crucial. To provide elements to answer, we conduct a comparison of eight tests representative of variance modeling strategies in gene expression data: Welch's t-test, ANOVA [1], Wilcoxon's test, SAM [2], RVM [3], limma [4], VarMixt [5] and SMVar [6]. Our comparison process relies on four steps (gene list analysis, simulations, spike-in data and re-sampling) to formulate comprehensive and robust conclusions about test performance, in terms of statistical power, false-positive rate, execution time and ease of use. Our results raise concerns about the ability of some methods to control the expected number of false positives at a desirable level. Besides, two tests (limma and VarMixt) show significant improvement compared to the t-test, in particular to deal with small sample sizes. In addition limma presents several practical advantages, so we advocate its application to analyze gene expression data.},
  author = {Jeanmougin, Marine and de Reynies, Aurelien and Marisa, Laetitia and Paccard, Caroline and Nuel, Gregory and Guedj, Mickael},
  date = {2010-01},
  doi = {10/frp7p2},
  eprint = {20838429},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Jeanmougin et al. - 2010 - Should we abandon the t-test in the analysis of ge.pdf},
  issn = {1932-6203},
  journaltitle = {PLoS ONE},
  keywords = {Analysis of Variance,Computer Simulation,Gene Expression Profiling,Gene Expression Profiling: statistics & numerical,Models; Statistical,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: statistic},
  number = {9},
  options = {useprefix=true},
  pages = {e12336},
  title = {Should We Abandon the {{t}}-Test in the Analysis of Gene Expression Microarray Data: A Comparison of Variance Modeling Strategies},
  volume = {5}
}

@article{Jiao2014,
  abstract = {Motivation: There is a growing number of studies generating matched Illumina Infinium HumanMethylation450 and gene expression data, yet there is a corresponding shortage of statistical tools aimed at their integrative analysis. Such integrative tools are important for the discovery of epigenetically regulated gene modules or molecular pathways, which play key roles in cellular differentiation and disease. Results: Here, we present a novel functional supervised algorithm, called Functional Epigenetic Modules (FEM), for the integrative analysis of Infinium 450k DNA methylation and matched or unmatched gene expression data. The algorithm identifies gene modules of coordinated differential methylation and differential expression in the context of a human interactome. We validate the FEM algorithm on simulated and real data, demonstrating how it successfully retrieves an epigenetically deregulated gene, previously known to drive endometrial cancer development. Importantly, in the same cancer, FEM identified a novel epigenetically deregulated hotspot, directly upstream of the well-known progesterone receptor tumour suppressor pathway. In the context of cellular differentiation, FEM successfully identifies known endothelial cell subtype-specific gene expression markers, as well as a novel gene module whose overexpression in blood endothelial cells is mediated by DNA hypomethylation. The systems-level integrative framework presented here could be used to identify novel key genes or signalling pathways, which drive cellular differentiation or disease through an underlying epigenetic mechanism. Availability and implementation: FEM is freely available as an R-package from http://sourceforge.net/projects/funepimod. Contact: andrew@picb.ac.cn Supplementary information: Supplementary data are available at Bioinformatics online.},
  author = {Jiao, Yinming and Widschwendter, Martin and Teschendorff, Andrew E.},
  date = {2014},
  doi = {10/f6jpf5},
  eprint = {24794928},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Jiao et al. - 2014 - A systems-level integrative framework for genome-w.pdf},
  issn = {1367-4811, 1460-2059},
  journaltitle = {Bioinformatics},
  number = {16},
  pages = {2360-2366},
  title = {A Systems-Level Integrative Framework for Genome-Wide {{DNA}} Methylation and Gene Expression Data Identifies Differential Gene Expression Modules under Epigenetic Control},
  volume = {30}
}

@article{Jin2007,
  abstract = {Nucleosomes containing the histone variant H3.3 tend to be clustered in vivo in the neighborhood of transcriptionally active genes and over regulatory elements. It has not been clear, however, whether H3.3-containing nucleosomes possess unique properties that would affect transcription. We report here that H3.3 nucleosomes isolated from vertebrates, regardless of whether they are partnered with H2A or H2A.Z, are unusually sensitive to salt-dependent disruption, losing H2A/H2B or H2A.Z/H2B dimers. Immunoprecipitation studies of nucleosome core particles (NCPs) show that NCPs that contain both H3.3 and H2A.Z are even less stable than NCPs containing H3.3 and H2A. Intriguingly, NCPs containing H3 and H2A.Z are at least as stable as H3/H2A NCPs. These results establish an hierarchy of stabilities for native nucleosomes carrying different complements of variants, and suggest how H2A.Z could play different roles depending on its partners within the NCP. They also are consistent with the idea that H3.3 plays an active role in maintaining accessible chromatin structures in enhancer regions and transcribed regions. Consistent with this idea, promoters and enhancers at transcriptionally active genes and coding regions at highly expressed genes have nucleosomes that simultaneously carry both H3.3 and H2A.Z, and should therefore be extremely sensitive to disruption. \textcopyright{} 2007 by Cold Spring Harbor Laboratory Press.},
  author = {Jin, Chunyuan and Felsenfeld, Gary},
  date = {2007},
  doi = {10/bwwqb3},
  eprint = {17575053},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Jin and Felsenfeld - 2007 - Nucleosome stability mediated by histone variants .pdf},
  issn = {0890-9369},
  journaltitle = {Genes \& Development},
  keywords = {Histone H2A.Z,Histone H3.3,Nucleosome structure},
  number = {12},
  pages = {1519-1529},
  title = {Nucleosome Stability Mediated by Histone Variants {{H3}}.3 and {{H2A}}.{{Z}}},
  volume = {21}
}

@article{Jin2009,
  abstract = {To understand how chromatin structure is organized by different histone variants, we have measured the genome-wide distribution of nucleosome core particles (NCPs) containing the histone variants H3.3 and H2A.Z in human cells. We find that a special class of NCPs containing both variants is enriched at 'nucleosome-free regions' of active promoters, enhancers and insulator regions. We show that preparative methods used previously in studying nucleosome structure result in the loss of these unstable double-variant NCPs. It seems likely that this instability facilitates the access of transcription factors to promoters and other regulatory sites in vivo. Other combinations of variants have different distributions, consistent with distinct roles for histone variants in the modulation of gene expression. \textcopyright{} 2009 Nature America, Inc. All rights reserved.},
  author = {Jin, Chunyuan and Zang, Chongzhi and Wei, Gang and Cui, Kairong and Peng, Weiqun and Zhao, Keji and Felsenfeld, Gary},
  date = {2009-08-26},
  doi = {10/bqfzgv},
  file = {/Users/ryan/Documents/Zotero Library/Jin et al. - 2009 - H3.3H2A.Z double variant-containing nucleosomes m.pdf},
  issn = {1061-4036},
  journaltitle = {Nature Genetics},
  number = {8},
  pages = {941-945},
  title = {H3.3/{{H2A}}.{{Z}} Double Variant-Containing Nucleosomes Mark `Nucleosome-Free Regions' of Active Promoters and Other Regulatory Regions},
  volume = {41}
}

@article{Joe2005a,
  author = {Joe, Harry and Zhu, Rong},
  date = {2005-04},
  doi = {10/b6h5gh},
  file = {/Users/ryan/Documents/Zotero Library/Joe and Zhu - 2005 - Generalized Poisson Distribution the Property of .pdf},
  issn = {0323-3847},
  journaltitle = {Biometrical Journal},
  keywords = {overdispersion,poisson mixture,skewness,zero-inflated distribution},
  number = {2},
  pages = {219-229},
  title = {Generalized {{Poisson}} Distribution: The Property of Mixture of {{Poisson}} and Comparison with Negative Binomial Distribution},
  volume = {47}
}

@article{Johnson2007,
  abstract = {Non-biological experimental variation or "batch effects" are commonly observed across multiple batches of microarray experiments, often rendering the task of combining data from these batches difficult. The ability to combine microarray data sets is advantageous to researchers to increase statistical power to detect biological phenomena from studies where logistical considerations restrict sample size or in studies that require the sequential hybridization of arrays. In general, it is inappropriate to combine data sets without adjusting for batch effects. Methods have been proposed to filter batch effects from data, but these are often complicated and require large batch sizes ({$>$}25) to implement. Because the majority of microarray studies are conducted using much smaller sample sizes, existing methods are not sufficient. We propose parametric and non-parametric empirical Bayes frameworks for adjusting data for batch effects that is robust to outliers in small sample sizes and performs comparable to existing methods for large samples. We illustrate our methods using two example data sets and show that our methods are justifiable, easy to apply, and useful in practice. Software for our method is freely available at: http://biosun1.harvard.edu/complab/batch/.},
  author = {Johnson, W. Evan and Li, Cheng and Rabinovic, Ariel},
  date = {2007-01-01},
  doi = {10/dsf386},
  eprint = {16632515},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Johnson et al. - 2007 - Adjusting batch effects in microarray expression d.pdf},
  issn = {1468-4357},
  journaltitle = {Biostatistics},
  keywords = {Bayes Theorem,Data Interpretation,Gene Expression Profiling,Gene Expression Profiling: methods,Humans,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods,Statistical},
  number = {1},
  pages = {118-127},
  title = {Adjusting Batch Effects in Microarray Expression Data Using Empirical {{Bayes}} Methods},
  volume = {8}
}

@article{Jones2016,
  author = {Jones, Daniel C and Kuppusamy, Kavitha T and Palpant, Nathan J and Peng, Xinxia and Murry, Charles E and Ruohola-Baker, Hannele and Ruzzo, Walter L},
  date = {2016},
  doi = {10/ggcxkp},
  file = {/Users/ryan/Documents/Zotero Library/Jones et al. - 2016 - Isolator  accurate and stable analysis of isoform.pdf},
  journaltitle = {bioRxiv},
  title = {Isolator: Accurate and Stable Analysis of Isoform-Level Expression in {{RNA}}-{{Seq}} Experiments}
}

@article{Joyce2012,
  author = {Joyce, Gerald F.},
  date = {2012-04-20},
  doi = {10/ggcxkq},
  eprint = {22517850},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Joyce - 2012 - Evolution. Toward an alternative biology..pdf},
  issn = {1095-9203},
  journaltitle = {Science},
  keywords = {Aptamers; Nucleotide,Aptamers; Nucleotide: chemistry,Aptamers; Nucleotide: metabolism,Evolution; Molecular,Molecular Mimicry,Nucleic Acids,Nucleic Acids: chemistry,Polymers,Polymers: chemistry},
  number = {6079},
  pages = {307-308},
  title = {Evolution. {{Toward}} an Alternative Biology},
  volume = {336}
}

@article{Kadota2012,
  abstract = {BACKGROUND: High-throughput sequencing, such as ribonucleic acid sequencing (RNA-seq) and chromatin immunoprecipitation sequencing (ChIP-seq) analyses, enables various features of organisms to be compared through tag counts. Recent studies have demonstrated that the normalization step for RNA-seq data is critical for a more accurate subsequent analysis of differential gene expression. Development of a more robust normalization method is desirable for identifying the true difference in tag count data.

RESULTS: We describe a strategy for normalizing tag count data, focusing on RNA-seq. The key concept is to remove data assigned as potential differentially expressed genes (DEGs) before calculating the normalization factor. Several R packages for identifying DEGs are currently available, and each package uses its own normalization method and gene ranking algorithm. We compared a total of eight package combinations: four R packages (edgeR, DESeq, baySeq, and NBPSeq) with their default normalization settings and with our normalization strategy. Many synthetic datasets under various scenarios were evaluated on the basis of the area under the curve (AUC) as a measure for both sensitivity and specificity. We found that packages using our strategy in the data normalization step overall performed well. This result was also observed for a real experimental dataset.

CONCLUSION: Our results showed that the elimination of potential DEGs is essential for more accurate normalization of RNA-seq data. The concept of this normalization strategy can widely be applied to other types of tag count data and to microarray data.},
  author = {Kadota, Koji and Nishiyama, Tomoaki and Shimizu, Kentaro},
  date = {2012-01},
  doi = {10/gcbq6m},
  eprint = {22475125},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Kadota et al. - 2012 - A normalization strategy for comparing tag count d.pdf},
  issn = {1748-7188},
  journaltitle = {Algorithms for Molecular Biology},
  number = {1},
  pages = {5},
  title = {A Normalization Strategy for Comparing Tag Count Data},
  volume = {7}
}

@article{Kapourani2018,
  abstract = {Measurements of single-cell methylation are revolutionizing our understanding of epigenetic control of gene expression, yet the intrinsic data sparsity limits the scope for quantitative analysis of such data. Here, we introduce Melissa (MEthyLation Inference for Single cell Analysis), a Bayesian hierarchical method to cluster cells based on local methylation patterns, discovering patterns of epigenetic variability between cells. The clustering also acts as an effective regularization for data imputation on unassayed CpG sites, enabling transfer of information between individual cells. We show both on simulated and real data sets that Melissa provides accurate and biologically meaningful clusterings and state-of-the-art imputation performance.},
  author = {Kapourani, Chantriolnt-Andreas and Sanguinetti, Guido},
  date = {2019-03-21},
  doi = {10/gfxdhh},
  file = {/Users/ryan/Documents/Zotero Library/Kapourani and Sanguinetti - 2019 - Melissa Bayesian clustering and imputation of sin.pdf;/Users/ryan/Documents/Zotero Library/Kapourani and Sanguinetti - 2019 - Melissa Bayesian clustering and imputation of sin2.pdf;/Users/ryan/Zotero/storage/D96XF48U/s13059-019-1665-8.html},
  issn = {1474-760X},
  journaltitle = {Genome Biology},
  number = {1},
  pages = {61},
  shortjournal = {Genome Biology},
  shorttitle = {Melissa},
  title = {Melissa: {{Bayesian}} Clustering and Imputation of Single-Cell Methylomes},
  volume = {20}
}

@article{Kapourani2018b,
  abstract = {Motivation: High-throughput measurements of DNA methylation are increasingly becoming a mainstay of biomedical investigations. While the methylation status of individual cytosines can sometimes be informative, several recent papers have shown that the functional role of DNA methylation is better captured by a quantitative analysis of the spatial variation of methylation across a genomic region. Results: Here, we present BPRMeth, a Bioconductor package that quantifies methylation profiles by generalized linear model regression. The original implementation has been enhanced in two important ways: we introduced a fast, variational inference approach that enables the quantification of Bayesian posterior confidence measures on the model, and we adapted the method to use several observation models, making it suitable for a diverse range of platforms including single-cell analyses and methylation arrays. Availability and implementation: http://bioconductor.org/packages/BPRMeth. Supplementary information: Supplementary data are available at Bioinformatics online.},
  author = {Kapourani, Chantriolnt-Andreas and Sanguinetti, Guido},
  date = {2018},
  doi = {10/gdtpws},
  file = {/Users/ryan/Documents/Zotero Library/Kapourani and Sanguinetti - 2018 - BPRMeth a flexible Bioconductor package for model.pdf},
  issn = {1367-4811},
  journaltitle = {Bioinformatics},
  number = {14},
  pages = {2485-2486},
  title = {{{BPRMeth}}: A Flexible {{Bioconductor}} Package for Modelling Methylation Profiles},
  volume = {34}
}

@article{Kasowski2010,
  author = {Kasowski, Maya and Grubert, Fabian and Heffelfinger, Christopher and Hariharan, Manoj and Asabere, Akwasi and Waszak, Sebastian M and Habegger, Lukas and Rozowsky, Joel and Shi, Minyi and Urban, Alexander E and Hong, Mi-Young and Karczewski, Konrad J and Huber, Wolfgang and Weissman, Sherman M and Gerstein, Mark B and Korbel, Jan O and Snyder, Michael},
  date = {2010-04-09},
  doi = {10/cbft6k},
  file = {/Users/ryan/Documents/Zotero Library/Kasowski et al. - 2010 - Variation in Transcription Factor Binding Among Hu.pdf},
  ids = {kasowskiVariationTranscriptionFactor2010},
  issn = {0036-8075, 1095-9203},
  journaltitle = {Science},
  langid = {english},
  number = {5975},
  pages = {232-235},
  shortjournal = {Science},
  title = {Variation in Transcription Factor Binding Among Humans},
  volume = {328}
}

@article{Kauffmann2009,
  abstract = {SUMMARY: The assessment of data quality is a major concern in microarray analysis. arrayQualityMetrics is a Bioconductor package that provides a report with diagnostic plots for one or two colour microarray data. The quality metrics assess reproducibility, identify apparent outlier arrays and compute measures of signal-to-noise ratio. The tool handles most current microarray technologies and is amenable to use in automated analysis pipelines or for automatic report generation, as well as for use by individuals. The diagnosis of quality remains, in principle, a context-dependent judgement, but our tool provides powerful, automated, objective and comprehensive instruments on which to base a decision. AVAILABILITY: arrayQualityMetrics is a free and open source package, under LGPL license, available from the Bioconductor project at www.bioconductor.org. A users guide and examples are provided with the package. Some examples of HTML reports generated by arrayQualityMetrics can be found at http://www.microarray-quality.org},
  author = {Kauffmann, Audrey and Gentleman, Robert and Huber, Wolfgang},
  date = {2009-02-01},
  doi = {10/bzgps2},
  eprint = {19106121},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Kauffmann et al. - 2009 - arrayQualityMetrics--a bioconductor package for qu.pdf},
  issn = {1367-4811},
  journaltitle = {Bioinformatics},
  keywords = {Algorithms,Computational Biology,Computational Biology: methods,Data Interpretation; Statistical,Internet,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods,Oligonucleotide Array Sequence Analysis: standards,Quality Control,Software},
  number = {3},
  pages = {415-416},
  title = {{{arrayQualityMetrics}}---a {{Bioconductor}} Package for Quality Assessment of Microarray Data},
  volume = {25}
}

@article{Kechris2010,
  abstract = {High density tiling arrays are an effective strategy for genome-wide identification of transcription factor binding regions. Sliding window methods that calculate moving averages of log ratios or t-statistics have been useful for the analysis of tiling array data. Here, we present a method that generalizes the moving average approach to evaluate sliding windows of p-values by using combined p-value statistics. In particular, the combined p-value framework can be useful in situations when taking averages of the corresponding test-statistic for the hypothesis may not be appropriate or when it is difficult to assess the significance of these averages. We exhibit the strengths of the combined p-values methods on Drosophila tiling array data and assess their ability to predict genomic regions enriched for transcription factor binding. The predictions are evaluated based on their proximity to target genes and their enrichment of known transcription factor binding sites. We also present an application for the generalization of the moving average based on integrating two different tiling array experiments.},
  author = {Kechris, Katerina J and Biehs, Brian and Kornberg, Thomas B},
  date = {2010},
  doi = {10/cj5d7w},
  eprint = {20812907},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Kechris et al. - 2010 - Generalizing moving averages for tiling arrays usi.pdf},
  issn = {1544-6115},
  journaltitle = {Statistical Applications in Genetics and Molecular Biology},
  number = {1},
  pages = {Article29},
  title = {Generalizing Moving Averages for Tiling Arrays Using Combined {{p}}-Value Statistics},
  volume = {9}
}

@article{Kent2003a,
  author = {Kent, W. J. and Baertsch, R. and Hinrichs, A. and Miller, W. and Haussler, D.},
  date = {2003-09-30},
  doi = {10/dtnhbb},
  file = {/Users/ryan/Documents/Zotero Library/Kent et al. - 2003 - Evolution's cauldron Duplication, deletion, and r.pdf},
  issn = {0027-8424, 1091-6490},
  journaltitle = {Proceedings of the National Academy of Sciences},
  langid = {english},
  number = {20},
  pages = {11484-11489},
  shortjournal = {Proceedings of the National Academy of Sciences},
  shorttitle = {Evolution's Cauldron},
  title = {Evolution's Cauldron: {{Duplication}}, Deletion, and Rearrangement in the Mouse and Human Genomes},
  volume = {100}
}

@article{Kerr2000,
  abstract = {Spotted cDNA microarrays are emerging as a powerful and cost-effective tool for large-scale analysis of gene expression. Microarrays can be used to measure the relative quantities of specific mRNAs in two or more tissue samples for thousands of genes simultaneously. While the power of this technology has been recognized, many open questions remain about appropriate analysis of microarray data. One question is how to make valid estimates of the relative expression for genes that are not biased by ancillary sources of variation. Recognizing that there is inherent "noise" in microarray data, how does one estimate the error variation associated with an estimated change in expression, i.e., how does one construct the error bars? We demonstrate that ANOVA methods can be used to normalize microarray data and provide estimates of changes in gene expression that are corrected for potential confounding effects. This approach establishes a framework for the general analysis and interpretation of microarray data.},
  author = {Kerr, M K and Martin, M and Churchill, G A},
  date = {2000-01},
  doi = {10/d4mc5j},
  eprint = {11382364},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Kerr et al. - 2000 - Analysis of variance for gene expression microarra.pdf},
  issn = {1066-5277},
  journaltitle = {Journal of Computational Biology},
  keywords = {Computer-Assisted,Female,Humans,Image Processing,Least-Squares Analysis,Liver,Liver: physiology,Male,Muscle,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods,Placenta,Placenta: physiology,Pregnancy,Reproducibility of Results,Skeletal,Skeletal: physiology},
  number = {6},
  pages = {819-837},
  title = {Analysis of Variance for Gene Expression Microarray Data},
  volume = {7}
}

@article{Kharchenko2008,
  abstract = {Critical considerations in the design and analysis of ChIP-seq experiments include how to align sequenced tags to the genome, how to detect binding sites and how to estimate the number of tags needed to confidently determine where a protein binds DNA. Using data set for three transcription factors, Kharchenko et al. address these considerations by comparing three novel algorithms with published computational methods.},
  author = {Kharchenko, Peter V. and Tolstorukov, Michael Y. and Park, Peter J.},
  date = {2008-12},
  doi = {10/d2rbh7},
  file = {/Users/ryan/Documents/Zotero Library/Kharchenko et al. - 2008 - Design and analysis of ChIP-seq experiments for DN.pdf;/Users/ryan/Zotero/storage/4JEFI8LB/nbt.html;/Users/ryan/Zotero/storage/7LP8ZHK6/SPP Tutorial (2019-11-15 7_53_30 AM).html},
  issn = {1546-1696},
  journaltitle = {Nature Biotechnology},
  keywords = {SPP},
  langid = {english},
  number = {12},
  pages = {1351-1359},
  shortjournal = {Nat Biotechnol},
  title = {Design and Analysis of {{ChIP}}-Seq Experiments for {{DNA}}-Binding Proteins},
  volume = {26}
}

@article{Kharchenko2014,
  abstract = {Single-cell data provide a means to dissect the composition of complex tissues and specialized cellular environments. However, the analysis of such measurements is complicated by high levels of technical noise and intrinsic biological variability. We describe a probabilistic model of expression-magnitude distortions typical of single-cell RNA-sequencing measurements, which enables detection of differential expression signatures and identification of subpopulations of cells in a way that is more tolerant of noise.},
  author = {Kharchenko, Peter V and Silberstein, Lev and Scadden, David T},
  date = {2014},
  doi = {10/gfgp7d},
  eprint = {24836921},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Kharchenko et al. - 2014 - Bayesian approach to single-cell differential expr.pdf},
  issn = {1548-7091, 1548-7105},
  journaltitle = {Nature Methods},
  number = {7},
  pages = {740-742},
  title = {Bayesian Approach to Single-Cell Differential Expression Analysis},
  volume = {11}
}

@article{Kikutake2016,
  abstract = {Epigenetic mechanisms such as DNA methylation or histone modifications are essential for the regulation of gene expression and development of tissues. Alteration of epigenetic modifications can be used as an epigenetic biomarker for diagnosis and as promising targets for epigenetic therapy. A recent study explored cancer-cell specific epigenetic biomarkers by examining different types of epigenetic modifications simultaneously. However, it was based on microarrays and reported biomarkers that were also present in normal cells at a low frequency. Here, we first analyzed multi-omics data (including ChIP-Seq data of six types of histone modifications: H3K27ac, H3K4me1, H3K9me3, H3K36me3, H3K27me3, and H3K4me3) obtained from 26 lung adenocarcinoma cell lines and a normal cell line. We identified six genes with both H3K27ac and H3K4me3 histone modifications in their promoter regions, which were not present in the normal cell line, but present in {$\geq$}85\% (22 out of 26) and {$\leq$}96\% (25 out of 26) of the lung adenocarcinoma cell lines. Of these genes, NUP210 (encoding a main component of the nuclear pore complex) was the only gene in which the two modifications were not detected in another normal cell line. RNA-Seq analysis revealed that NUP210 was aberrantly overexpressed among the 26 lung adenocarcinoma cell lines, although the frequency of NUP210 overexpression was lower (19.3\%) in 57 lung adenocarcinoma tissue samples studied and stored in another database. This study provides a basis to discover epigenetic biomarkers highly specific to a certain cancer, based on multi-omics data at the cell population level.},
  author = {Kikutake, Chie and Yahara, Koji},
  date = {2016},
  doi = {10/ggcxkr},
  eprint = {27042856},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Kikutake and Yahara - 2016 - Identification of epigenetic biomarkers of lung ad.pdf},
  issn = {1932-6203},
  journaltitle = {PLoS ONE},
  number = {4},
  pages = {1-20},
  title = {Identification of Epigenetic Biomarkers of Lung Adenocarcinoma through Multi-Omics Data Analysis},
  volume = {11}
}

@article{Kim2012,
  abstract = {The 15 known Fanconi anemia proteins cooperate in a pathway that regulates DNA interstrand cross-link repair. Recent studies indicate that the Fanconi anemia pathway also controls Rev1-mediated translesion DNA synthesis (TLS). We identified Fanconi anemia-associated protein (FAAP20), an integral subunit of the multisubunit Fanconi anemia core complex. FAAP20 binds to FANCA subunit and is required for stability of the complex and monoubiquitination of FANCD2. FAAP20 contains a ubiquitin-binding zinc finger 4 domain and binds to the monoubiquitinated form of Rev1. FAAP20 binding stabilizes Rev1 nuclear foci and promotes interaction of the Fanconi anemia core with PCNA-Rev1 DNA damage bypass complexes. FAAP20 therefore provides a critical link between the Fanconi anemia pathway and TLS polymerase activity. We propose that the Fanconi anemia core complex regulates cross-link repair by channeling lesions to damage bypass pathways and preventing large DNA insertions and deletions.},
  author = {Kim, Hyungjin and Yang, Kailin and Dejsuphong, Donniphat and D'Andrea, Alan D},
  date = {2012-02-22},
  doi = {10/fzhhff},
  eprint = {22266823},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Kim et al. - 2012 - Regulation of Rev1 by the Fanconi anemia core comp.pdf},
  issn = {1545-9993},
  journaltitle = {Nature Structural \& Molecular Biology},
  keywords = {Fanconi Anemia Complementation Group Proteins,Fanconi Anemia Complementation Group Proteins: met,Gene Expression Regulation,Nuclear Proteins,Nuclear Proteins: metabolism,Nucleotidyltransferases,Nucleotidyltransferases: metabolism,Protein Binding,Protein Stability},
  number = {2},
  pages = {164-170},
  title = {Regulation of {{Rev1}} by the {{Fanconi}} Anemia Core Complex},
  volume = {19}
}

@article{Kim2013,
  author = {Kim, Daehwan and Pertea, Geo and Trapnell, Cole and Pimentel, Harold and Kelley, Ryan and Salzberg, Steven L.},
  date = {2013},
  doi = {10/gf3xsm},
  journaltitle = {Genome Biology},
  number = {4},
  pages = {R36},
  title = {{{TopHat2}}: Accurate Alignment of Transcriptomes in the Presence of Insertions, Deletions and Gene Fusions},
  volume = {14}
}

@article{Kim2014,
  abstract = {Solid organ transplantation has transformed the lives of many children and adults by providing treatment for patients with organ failure who would have otherwise succumbed to their disease. The first successful transplant in 1954 was a kidney transplant between identical twins, which circumvented the problem of rejection from MHC incompatibility. Further progress in solid organ transplantation was enabled by the discovery of immunosuppressive agents such as corticosteroids and azathioprine in the 1950s and ciclosporin in 1970. Today, solid organ transplantation is a conventional treatment with improved patient and allograft survival rates. However, the challenge that lies ahead is to extend allograft survival time while simultaneously reducing the side effects of immunosuppression. This is particularly important for children who have irreversible organ failure and may require multiple transplants. Pediatric transplant teams also need to improve patient quality of life at a time of physical, emotional and psychosocial development. This review will elaborate on the long-term outcomes of children after kidney, liver, heart, lung and intestinal transplantation. As mortality rates after transplantation have declined, there has emerged an increased focus on reducing longer-term morbidity with improved outcomes in optimizing cardiovascular risk, renal impairment, growth and quality of life. Data were obtained from a review of the literature and particularly from national registries and databases such as the North American Pediatric Renal Trials and Collaborative Studies for the kidney, SPLIT for liver, International Society for Heart and Lung Transplantation and UNOS for intestinal transplantation. \textcopyright{} 2014 CLINICS.},
  author = {Kim, Jon Jin and Marks, Stephen D.},
  date = {2014-01-15},
  doi = {10/ggcxks},
  eprint = {24860856},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Kim and Marks - 2014 - Long-term outcomes of children after solid organ t.pdf},
  issn = {1807-5932},
  issue = {SUPPL.1},
  journaltitle = {Clinics},
  keywords = {Cardiovascular,Kidney function,Morbidity,Quality Of life,Survival},
  pages = {28-38},
  title = {Long-Term Outcomes of Children after Solid Organ Transplantation},
  volume = {69}
}

@article{Kim2019,
  abstract = {The human reference genome represents only a small number of individuals, which limits its usefulness for genotyping. We present a method named HISAT2 (hierarchical indexing for spliced alignment of transcripts 2) that can align both DNA and RNA sequences using a graph Ferragina Manzini index. We use HISAT2 to represent and search an expanded model of the human reference genome in which over 14.5 million genomic variants in combination with haplotypes are incorporated into the data structure used for searching and alignment. We benchmark HISAT2 using simulated and real datasets to demonstrate that our strategy of representing a population of genomes, together with a fast, memory-efficient search algorithm, provides more detailed and accurate variant analyses than other methods. We apply HISAT2 for HLA typing and DNA fingerprinting; both applications form part of the HISAT-genotype software that enables analysis of haplotype-resolved genes or genomic regions. HISAT-genotype outperforms other computational methods and matches or exceeds the performance of laboratory-based assays. A graph-based genome indexing scheme enables variant-aware alignment of sequences with very low memory requirements.},
  author = {Kim, Daehwan and Paggi, Joseph M. and Park, Chanhee and Bennett, Christopher and Salzberg, Steven L.},
  date = {2019},
  doi = {10/gf5395},
  file = {/Users/ryan/Documents/Zotero Library/Kim et al. - 2019 - Graph-based genome alignment and genotyping with H.pdf},
  issn = {1087-0156},
  journaltitle = {Nature Biotechnology},
  number = {8},
  pages = {907-915},
  title = {Graph-Based Genome Alignment and Genotyping with {{HISAT2}} and {{HISAT}}-Genotype},
  volume = {37}
}

@article{Klein2014,
  abstract = {MOTIVATION: Histone modifications are a key epigenetic mechanism to activate or repress the transcription of genes. Datasets of matched transcription data and histone modification data obtained by ChIP-seq exist, but methods for integrative analysis of both data types are still rare. Here, we present a novel bioinformatics approach to detect genes that show different transcript abundances between two conditions putatively caused by alterations in histone modification. RESULTS: We introduce a correlation measure for integrative analysis of ChIP-seq and gene transcription data measured by RNA sequencing or microarrays and demonstrate that a proper normalization of ChIP-seq data is crucial. We suggest applying Bayesian mixture models of different types of distributions to further study the distribution of the correlation measure. The implicit classification of the mixture models is used to detect genes with differences between two conditions in both gene transcription and histone modification. The method is applied to different datasets, and its superiority to a naive separate analysis of both data types is demonstrated. Availability and implementation: R/Bioconductor package epigenomix. CONTACT: h.klein@uni-muenster.de SUPPLEMENTARY INFORMATION: Supplementary data are available at Bioinformatics online.},
  author = {Klein, Hans-Ulrich and Sch{\"a}fer, Martin and Porse, Bo T. and Hasemann, Marie S. and Ickstadt, Katja and Dugas, Martin},
  date = {2014-04-15},
  doi = {10/f5x88w},
  eprint = {24403540},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Klein et al. - 2014 - Integrative analysis of histone ChIP-seq and trans.pdf},
  issn = {1460-2059},
  journaltitle = {Bioinformatics},
  number = {8},
  pages = {1154-1162},
  title = {Integrative Analysis of Histone {{ChIP}}-Seq and Transcription Data Using {{Bayesian}} Mixture Models},
  volume = {30}
}

@article{Klein2015,
  abstract = {In the biology of tissue development and diseases, DNA methylation plays an important role. For a deeper understanding, it is crucial to accurately compare DNA methylation patterns between groups of samples representing different conditions. A widely used method to investigate DNA methylation in the CpG context is bisulfite sequencing, which produces data on the single-nucleotide scale. While there are benefits to analyzing CpG sites on a basepair level, there are both biological and statistical reasons to test entire genomic regions for differential methylation. However, the analysis of DNA methylation is hampered by the lack of best practice standards. Here, we compared multiple approaches for testing predefined genomic regions for differential DNA methylation in bisulfite sequencing data. Nine methods were evaluated: BiSeq, COHCAP, Goeman's Global Test, Limma, methylKit/eDMR, RADMeth and three log-linear regression approaches with different distribution assumptions. We applied these methods to simulated data and determined their sensitivity and specificity. This revealed performance differences, which were also seen when applied to real data. Methods that first test single CpG sites and then test regions based on transformed CpG-wise P-values performed better than methods that summarize methylation levels or raw reads. Interestingly, smoothing of methylation levels had a negligible impact. In particular, Global Test, BiSeq and RADMeth/z-test outperformed the other methods we evaluated, providing valuable guidance for more accurate analysis of DNA methylation.},
  author = {Klein, Hans-Ulrich and Hebestreit, Katja},
  date = {2015},
  doi = {10/f89drz},
  eprint = {26515532},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Klein and Hebestreit - 2015 - An evaluation of methods to test predefined genomi.pdf},
  issn = {1467-5463},
  issue = {September},
  journaltitle = {Briefings in Bioinformatics},
  keywords = {★,bisulfite sequencing,differentially methylated regions,dna methylation},
  pages = {bbv095},
  title = {An Evaluation of Methods to Test Predefined Genomic Regions for Differential Methylation in Bisulfite Sequencing Data}
}

@inproceedings{Kleinberg2002,
  abstract = {Although the study of clustering is centered around an intuitively compelling goal, it has been very difficult to develop a unified framework for reasoning about it at a technical level, and profoundly diverse approaches to clustering abound in the research community. Here we suggest a formal perspective on the difficulty in finding such a unification, in the form of an impossibility theorem: for a set of three simple properties, we show that there is no clustering function satisfying all three. Relaxations of these properties expose some of the interesting (and unavoidable) trade-offs at work in well-studied clustering techniques such as single-linkage, sum-of-pairs, k-means, and k-median.},
  author = {Kleinberg, Jon},
  booktitle = {Advances in {{Neural Information Processing Systems}}},
  date = {2003-07},
  file = {/Users/ryan/Documents/Zotero Library/Kleinberg - 2003 - An impossibility theorem for clustering.pdf},
  isbn = {0-262-02550-7},
  issn = {1049-5258},
  location = {{Cambridge, MA, USA}},
  pages = {463-470},
  publisher = {{MIT Press}},
  title = {An Impossibility Theorem for Clustering},
  url = {http://dl.acm.org/citation.cfm?id=2968618.2968676}
}

@article{Komori2011,
  abstract = {Cytosine methylation of DNA CpG dinucleotides in gene promoters is an epigenetic modification that regulates gene transcription. While many methods exist to interrogate methylation states, few current methods offer large-scale, targeted, single CpG resolution. We report an approach combining bisulfite treatment followed by microdroplet PCR with next-generation sequencing to assay the methylation state of 50 genes in the regions 1 kb upstream of and downstream from their transcription start sites. This method yielded 96\% coverage of the targeted CpGs and demonstrated high correlation between CpG island (CGI) DNA methylation and transcriptional regulation. The method was scaled to interrogate the methylation status of 77,674 CpGs in the promoter regions of 2100 genes in primary CD4 T cells. The 2100 gene library yielded 97\% coverage of all targeted CpGs and 99\% of the target amplicons.},
  author = {Komori, H Kiyomi and LaMere, Sarah A and Torkamani, Ali and Hart, G Traver and Kotsopoulos, Steve and Warner, Jason and Samuels, Michael L and Olson, Jeff and Head, Steven R and Ordoukhanian, Phillip and Lee, Pauline L and Link, Darren R and Salomon, Daniel R},
  date = {2011-10},
  doi = {10/bmvzwj},
  eprint = {21757609},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Komori et al. - 2011 - Application of microdroplet PCR for large-scale ta.pdf},
  issn = {1549-5469},
  journaltitle = {Genome Research},
  keywords = {Base Sequence,CpG Islands,DNA,DNA Methylation,DNA Primers,DNA Primers: chemistry,DNA: chemistry,DNA: genetics,Epigenesis; Genetic,High-Throughput Nucleotide Sequencing,High-Throughput Nucleotide Sequencing: methods,Humans,Jurkat Cells,Microchemistry,Microchemistry: methods,Polymerase Chain Reaction,Polymerase Chain Reaction: methods,Promoter Regions; Genetic,Sequence Analysis; DNA,Sequence Analysis; DNA: methods,Sulfites,Sulfites: chemistry},
  number = {10},
  pages = {1738-1745},
  title = {Application of Microdroplet {{PCR}} for Large-Scale Targeted Bisulfite Sequencing},
  volume = {21}
}

@article{Koster2012,
  abstract = {SUMMARY: Snakemake is a workflow engine that provides a readable Python-based workflow definition language and a powerful execution environment that scales from single-core workstations to compute clusters without modifying the workflow. It is the first system to support the use of automatically inferred multiple named wildcards (or variables) in input and output filenames. AVAILABILITY: http://snakemake.googlecode.com. CONTACT: johannes.koester@uni-due.de.},
  author = {K{\"o}ster, Johannes and Rahmann, Sven},
  date = {2012},
  doi = {10/gd2xzq},
  eprint = {22908215},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Köster and Rahmann - 2012 - Snakemake-a scalable bioinformatics workflow engin.pdf},
  issn = {1367-4803},
  journaltitle = {Bioinformatics},
  number = {19},
  pages = {2520-2522},
  title = {Snakemake---a Scalable Bioinformatics Workflow Engine},
  volume = {28}
}

@article{Kostka2008,
  abstract = {Microarray gene expression signatures hold great promise to improve diagnosis and prognosis of disease. However, current documentation standards of such signatures do not allow for an unambiguous application to study-external patients. This hinders independent evaluation, effectively delaying the use of signatures in clinical practice. Data from eight publicly available clinical microarray studies were analyzed and the consistency of study-internal with study-external diagnoses was evaluated. Study-external classifications were based on documented information only. Documenting a signature is conceptually different from reporting a list of genes. We show that even the exact quantitative specification of a classification rule alone does not define a signature unambiguously. We found that discrepancy between study-internal and study-external diagnoses can be as frequent as 30\% (worst case) and 18\% (median). By using the proposed documentation by value strategy, which documents quantitative preprocessing information, the median discrepancy was reduced to 1\%. The process of evaluating microarray gene expression diagnostic signatures and bringing them to clinical practice can be substantially improved and made more reliable by better documentation of the signatures.},
  author = {Kostka, Dennis and Spang, Rainer},
  date = {2008-02},
  doi = {10/fng9px},
  eprint = {18282081},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Kostka and Spang - 2008 - Microarray based diagnosis profits from better doc.pdf},
  issn = {1553-7358},
  journaltitle = {PLoS Computational Biology},
  keywords = {Algorithms,Diagnosis; Computer-Assisted,Diagnosis; Computer-Assisted: methods,Documentation,Documentation: methods,Gene Expression Profiling,Gene Expression Profiling: methods,Humans,Neoplasm Proteins,Neoplasm Proteins: analysis,Neoplasms,Neoplasms: diagnosis,Neoplasms: metabolism,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods,Reproducibility of Results,Sensitivity and Specificity,Tumor Markers; Biological,Tumor Markers; Biological: analysis},
  number = {2},
  pages = {e22},
  title = {Microarray Based Diagnosis Profits from Better Documentation of Gene Expression Signatures},
  volume = {4}
}

@article{Kowalski2003,
  abstract = {Each year, 55 000 organ transplants are performed worldwide. Cumulatively, the number of living organ recipients is now estimated to be over 300 000. Most of these transplant recipients will remain on immunosuppressive drugs for the remainder of their lives to prevent rejection episodes. Controlled doses of these drugs are required to prevent over-medication, which may leave the patient susceptible to opportunistic infection and drug toxicity effects, or under-dosing, which may lead to shortened graft survival because of rejection episodes. This paper describes the result of a multicenter study conducted at the Universities of Pittsburgh, Alabama and Maryland to evaluate an in vitro assay (Cylex\texttrademark{} Immune Cell Function Assay) for the measurement of global immune response in transplant patients receiving immunosuppressive therapy. The assay uses a whole blood sample to maintain the presence of the drug during incubation. Following overnight incubation of blood with phytohemagglutinin (PHA), CD4 cells are selected using paramagnetic particles coated with a monoclonal antibody to the CD4 epitope. The CD4-positive cells are targeted as major immunosuppressive drugs are designed to specifically inhibit T-cell activation which has been implicated in rejection. The data generated at these three sites were submitted in support of an Food and Drug Association (FDA) application for the use of this assay in the detection of cell-mediated immunity in an immunosuppressed population. The assay was cleared by the FDA on April 2, 2002. This cross-sectional study was designed to establish ranges for reactivity of this bioassay in the assessment of functional immunity for an individual solid organ recipient at any point in time.},
  author = {Kowalski, Richard and Post, Diane and Schneider, Mary C. and Britz, Judith and Thomas, Judy and Deierhoi, Mark and Lobashevsky, Andrew and Redfield, Robert and Schweitzer, Eugene and Heredia, Alonso and Reardon, Elise and Davis, Charles and Bentlejewski, Carol and Fung, John and Shapiro, Ron and Zeevi, Adriana},
  date = {2003},
  doi = {10/btbdwp},
  eprint = {12709071},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Kowalski et al. - 2003 - Immune cell function testing An adjunct to therap.pdf},
  issn = {0902-0063},
  journaltitle = {Clinical Transplantation},
  keywords = {Functional immunosuppression,Immune cell function testing,Immune monitoring,Immune response,Immunosuppression,Therapeutic drug monitoring,Transplant patient management,Transplant rejection},
  number = {2},
  pages = {77-88},
  title = {Immune Cell Function Testing: {{An}} Adjunct to Therapeutic Drug Monitoring in Transplant Patient Management},
  volume = {17}
}

@article{Kurian2014,
  abstract = {There are no minimally invasive diagnostic metrics for acute kidney transplant rejection (AR), especially in the setting of the common confounding diagnosis, acute dysfunction with no rejection (ADNR). Thus, though kidney transplant biopsies remain the gold standard, they are invasive, have substantial risks, sampling error issues and significant costs and are not suitable for serial monitoring. Global gene expression profiles of 148 peripheral blood samples from transplant patients with excellent function and normal histology (TX; n = 46), AR (n = 63) and ADNR (n = 39), from two independent cohorts were analyzed with DNA microarrays. We applied a new normalization tool, frozen robust multi-array analysis, particularly suitable for clinical diagnostics, multiple prediction tools to discover, refine and validate robust molecular classifiers and we tested a novel one-by-one analysis strategy to model the real clinical application of this test. Multiple three-way classifier tools identified 200 highest value probesets with sensitivity, specificity, positive predictive value, negative predictive value and area under the curve for the validation cohort ranging from 82\% to 100\%, 76\% to 95\%, 76\% to 95\%, 79\% to 100\%, 84\% to 100\% and 0.817 to 0.968, respectively. We conclude that peripheral blood gene expression profiling can be used as a minimally invasive tool to accurately reveal TX, AR and ADNR in the setting of acute kidney transplant dysfunction.},
  author = {Kurian, S M and Williams, A N and Gelbart, T and Campbell, D and Mondala, T S and Head, S R and Horvath, S and Gaber, L and Thompson, R and Whisenant, T and Lin, W and Langfelder, P and Robison, E H and Schaffer, R L and Fisher, J S and Friedewald, J and Flechner, S M and Chan, L K and Wiseman, A C and Shidban, H and Mendez, R and Heilman, R and Abecassis, M M and Marsh, C L and Salomon, D R},
  date = {2014-05},
  doi = {10/f5xswg},
  eprint = {24725967},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Kurian et al. - 2014 - Molecular Classifiers for Acute Kidney Transplant .pdf},
  issn = {1600-6135},
  journaltitle = {American Journal of Transplantation},
  keywords = {abbreviations,abmr,acute,acute dysfunction with no,antibody-mediated rejection,arrays,gene expression profiling,kidney rejection,micro-,molecular classifiers,rejection},
  number = {5},
  pages = {1164-1172},
  title = {Molecular Classifiers for Acute Kidney Transplant Rejection in Peripheral Blood by Whole Genome Gene Expression Profiling},
  volume = {14}
}

@article{Kurian2017,
  abstract = {We performed orthogonal technology comparisons of concurrent peripheral blood and biopsy tissue samples from 69 kidney transplant recipients who underwent comprehensive algorithm-driven clinical phenotyping. The sample cohort included patients with normal protocol biopsies and stable transplant (sTx) function (n = 25), subclinical acute rejection (subAR, n = 23), and clinical acute rejection (cAR, n = 21). Comparisons between microarray and RNA sequencing (RNA-seq) signatures were performed and demonstrated a strong correlation between the blood and tissue compartments for both technology platforms. A number of shared differentially expressed genes and pathways between subAR and cAR in both platforms strongly suggest that these two clinical phenotypes form a continuum of alloimmune activation. SubAR is associated with fewer or less expressed genes than cAR in blood, whereas in biopsy tissues, this clinical phenotype demonstrates a more robust molecular signature for both platforms. The discovery work done in this study confirms a clear ability to detect gene expression profiles for sTx, subAR, and cAR in both blood and biopsy tissue, yielding equivalent predictive performance that is agnostic to both technology and platform. Our data also provide strong biological insights into the molecular mechanisms underlying these signatures, underscoring their logistical potential as molecular diagnostics to improve clinical outcomes following kidney transplantation.},
  author = {Kurian, S. M. and Velazquez, E. and Thompson, R. and Whisenant, T. and Rose, S. and Riley, N. and Harrison, F. and Gelbart, T. and Friedewald, J. J. and Charette, J. and Brietigam, S. and Peysakhovich, J. and First, M. R. and Abecassis, M. M. and Salomon, D. R.},
  date = {2017-08},
  doi = {10/gbp6vr},
  file = {/Users/ryan/Documents/Zotero Library/Kurian et al. - 2017 - Orthogonal Comparison of Molecular Signatures of K.pdf},
  issn = {1600-6135},
  journaltitle = {American Journal of Transplantation},
  keywords = {clinical research/practice,diagnostic techniques and imaging,genomics,kidney (allograft) function/dysfunction,kidney transplantation/nephrology,microarray/gene array,rejection: acute,translational research/science},
  number = {8},
  pages = {2103-2116},
  title = {Orthogonal Comparison of Molecular Signatures of Kidney Transplants with Subclinical and Clinical Acute Rejection: {{Equivalent}} Performance Is Agnostic to Both Technology and Platform},
  volume = {17}
}

@article{Kuwano2009,
  abstract = {The RNA-binding protein nuclear factor 90 (NF90) has been implicated in the stabilization, transport and translational control of several target mRNAs. However, a systematic analysis of NF90 target mRNAs has not been performed. Here, we use ribonucleoprotein immunoprecipitation analysis to identify a large subset of NF90-associated mRNAs. Comparison of the 3'-untranslated regions (UTRs) of these mRNAs led to the elucidation of a 25- to 30-nucleotide, RNA signature motif rich in adenines and uracils. Insertion of the AU-rich NF90 motif ('NF90m') in the 3'UTR of an EGFP heterologous reporter did not affect the steady-state level of the chimeric EGFP-NF90m mRNA or its cytosolic abundance. Instead, the translation of EGFP-NF90m mRNA was specifically repressed in an NF90-dependent manner, as determined by analysing nascent EGFP translation, the distribution of chimeric mRNAs on polysome gradients and the steady-state levels of expressed EGFP protein. The interaction of endogenous NF90 with target mRNAs was validated after testing both endogenous mRNAs and recombinant biotinylated transcripts containing NF90 motif hits. Further analysis showed that the stability of endogenous NF90 target mRNAs was not significantly influenced by NF90 abundance, while their translation increased when NF90 levels were reduced. In summary, we have identified an AU-rich RNA motif present in NF90 target mRNAs and have obtained evidence that NF90 represses the translation of this subset of mRNAs.},
  author = {Kuwano, Yuki and Pullmann, Rudolf and Marasa, Bernard S. and Abdelmohsen, Kotb and Lee, Eun Kyung and Yang, Xiaoling and Martindale, Jennifer L. and Zhan, Ming and Gorospe, Myriam},
  date = {2010},
  doi = {10/fwnmcz},
  eprint = {19850717},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Kuwano et al. - 2009 - NF90 selectively represses the translation of targ.pdf},
  issn = {0305-1048},
  journaltitle = {Nucleic Acids Research},
  number = {1},
  pages = {225-238},
  title = {{{NF90}} Selectively Represses the Translation of Target {{mRNAs}} Bearing an {{AU}}-Rich Signature Motif},
  volume = {38}
}

@article{Kvam2012,
  abstract = {RNA-Seq technologies are quickly revolutionizing genomic studies, and statistical methods for RNA-seq data are under continuous development. Timely review and comparison of the most recently proposed statistical methods will provide a useful guide for choosing among them for data analysis. Particular interest surrounds the ability to detect differential expression (DE) in genes. Here we compare four recently proposed statistical methods, edgeR, DESeq, baySeq, and a method with a two-stage Poisson model (TSPM), through a variety of simulations that were based on different distribution models or real data. We compared the ability of these methods to detect DE genes in terms of the significance ranking of genes and false discovery rate control. All methods compared are implemented in freely available software. We also discuss the availability and functions of the currently available versions of these software.},
  author = {Kvam, Vanessa M and Liu, Peng and Si, Yaqing},
  date = {2012-02},
  doi = {10/fxxn93},
  eprint = {22268221},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Kvam et al. - 2012 - A comparison of statistical methods for detecting .pdf},
  issn = {1537-2197},
  journaltitle = {American Journal of Botany},
  number = {2},
  pages = {248-256},
  title = {A Comparison of Statistical Methods for Detecting Differentially Expressed Genes from {{RNA}}-Seq Data},
  volume = {99}
}

@article{Lahens2014,
  abstract = {BACKGROUND: RNA-seq is a powerful technique for identifying and quantifying transcription and splicing events, both known and novel. However, given its recent development and the proliferation of library construction methods, understanding the bias it introduces is incomplete but critical to realizing its value. RESULTS: We present a method, in vitro transcription sequencing (IVT-seq), for identifying and assessing the technical biases in RNA-seq library generation and sequencing at scale. We created a pool of over 1,000 in vitro transcribed RNAs from a full-length human cDNA library and sequenced them with polyA and total RNA-seq, the most common protocols. Because each cDNA is full length, and we show in vitro transcription is incredibly processive, each base in each transcript should be equivalently represented. However, with common RNA-seq applications and platforms, we find 50\% of transcripts have more than two-fold and 10\% have more than 10-fold differences in within-transcript sequence coverage. We also find greater than 6\% of transcripts have regions of dramatically unpredictable sequencing coverage between samples, confounding accurate determination of their expression. We use a combination of experimental and computational approaches to show rRNA depletion is responsible for the most significant variability in coverage, and several sequence determinants also strongly influence representation. CONCLUSIONS: These results show the utility of IVT-seq for promoting better understanding of bias introduced by RNA-seq. We find rRNA depletion is responsible for substantial, unappreciated biases in coverage introduced during library preparation. These biases suggest exon-level expression analysis may be inadvisable, and we recommend caution when interpreting RNA-seq results.},
  author = {Lahens, Nicholas F and Kavakli, Ibrahim Halil and Zhang, Ray and Hayer, Katharina and Black, Michael B and Dueck, Hannah and Pizarro, Angel and Kim, Junhyong and Irizarry, Rafael and Thomas, Russell S and Grant, Gregory R and Hogenesch, John B},
  date = {2014},
  doi = {10/ggcht2},
  eprint = {24981968},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Lahens et al. - 2014 - IVT-seq reveals extreme bias in RNA sequencing..pdf},
  issn = {1474-760X},
  journaltitle = {Genome Biology},
  keywords = {Animals,Artifacts,Base Composition,Base Sequence,Gene Library,Genetic,Humans,In Vitro Techniques,Inbred C57BL,Male,Mice,Nucleic Acid,Ribosomal,Ribosomal: genetics,RNA,Sequence Analysis,Sequence Homology,Transcription},
  number = {6},
  pages = {R86},
  title = {{{IVT}}-Seq Reveals Extreme Bias in {{RNA}} Sequencing},
  volume = {15}
}

@thesis{LaMere2015,
  abstract = {CD4 T cells undergo activation and differentiation into various cellular subtypes in response to antigen. Memory T cells are known to be primed for rapid responses, but the epigenetic influences upon the process of activation and the formation of memory cells are still poorly defined. Two major epigenetic mechanisms influencing the regulation of gene expression include CpG methylation and histone modifications. We developed a novel method to analyze CpG methylation in order to interrogate the CpG methylation status of 2100 gene promoters in na\"ive and memory CD4 T cells. From these data, we demonstrate that CpG methylation profiling of a relatively small gene set can distinguish memory from na\"ive subsets. Additionally, we identify a class of primed genes in memory cells that are putatively regulated by CpG methylation, many of which have not previously been studied in T cells. In addition to our CpG methylation studies, we have profiled 3 histone modifications in na\"ive and memory CD4 T cells during activation. H3K4 and H3K27 methylation are frequently studied in the context of their association with gene expression, but their function during CD4 T cell activation has not been determined. Using ChIPseq for H3K4me2, H3K4me3 and H3K27me3 alongside RNAseq in na\"ive and memory human CD4 T cells at rest and after activation, we have defined the roles these modifications are playing throughout the process of activation and linked them back to regulation of key pathways in T cell activation and differentiation. Our results demonstrate that promoter H3K4 methylation provides a feed-forward mechanism for upregulating RNA expression during activation, while changes to promoter H3K27me3 after activation reinforce baseline expression at rest. 
H3K27me3 demethylation is a prominent finding in both na\"ive and memory cells early in activation, and inhibiting this demethylation leads to proliferation defects, blunted CD25 upregulation, cytokine perturbations, and cell cycle disruptions in na\"ive CD4 T cells, demonstrating that H3K27me3 demethylation is integral to CD4 T cell activation. Our results enhance our understanding of the role these epigenetic modifications play during CD4 T cell activation and underscore key differences between na\"ive and memory cells in their activation dynamics.},
  author = {LaMere, Sarah Adrianne Hutchison},
  date = {2015},
  file = {/Users/ryan/Documents/Zotero Library/LaMere - 2015 - Dynamic epigenetic regulation of CD4 T cell activa.pdf},
  institution = {{The Scripps Research Institute}},
  pagetotal = {371},
  title = {Dynamic Epigenetic Regulation of {{CD4 T}} Cell Activation and Memory Formation},
  type = {phdthesis}
}

@article{LaMere2016,
  abstract = {The epigenetic determinants driving the responses of CD4 T cells to antigen are currently an area of active research. Much has been done to characterize helper T-cell subsets and their associated genome-wide epigenetic patterns. In contrast, little is known about the dynamics of histone modifications during CD4 T-cell activation and the differential kinetics of these epigenetic marks between naive and memory T cells. In this study, we have detailed the dynamics of genome-wide promoter H3K4me2 and H3K4me3 over a time course during activation of human naive and memory CD4 T cells. Our results demonstrate that changes to H3K4 methylation occur relatively late after activation (5 days) and reinforce activation-induced upregulation of gene expression, affecting multiple pathways important to T-cell activation, differentiation and function. The dynamics and mapped pathways of H3K4 methylation are distinctly different in memory cells, which have substantially more promoters marked by H3K4me3 alone, reinforcing their more differentiated state. Our study provides the first data examining genome-wide histone modification dynamics during CD4 T-cell activation, providing insight into the cross talk between H3K4 methylation and gene expression, and underscoring the impact of these marks upon key pathways integral to CD4 T-cell activation and function.},
  author = {LaMere, Sarah A. and Thompson, Ryan C. and Komori, H. Kiyomi and Mark, Adam and Salomon, Daniel R.},
  date = {2016-07-12},
  doi = {10/f97x85},
  eprint = {27170561},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Mendeley Desktop/LaMere et al. - 2016 - Promoter H3K4 methylation dynamically reinforces a.ppt;/Users/ryan/Documents/Zotero Library/LaMere et al. - 2016 - Promoter H3K4 methylation dynamically reinforces a.pdf},
  issn = {1466-4879},
  journaltitle = {Genes \& Immunity},
  number = {5},
  pages = {283-297},
  title = {Promoter {{H3K4}} Methylation Dynamically Reinforces Activation-Induced Pathways in Human {{CD4 T}} Cells},
  volume = {17}
}

@article{LaMere2017,
  abstract = {The changes to the epigenetic landscape in response to Ag during CD4 T cell activation have not been well characterized. Although CD4 T cell subsets have been mapped globally for numerous epigenetic marks, little has been done to study their dynamics early after activation. We have studied changes to promoter H3K27me3 during activation of human naive and memory CD4 T cells. Our results show that these changes occur relatively early (1 d) after activation of naive and memory cells and that demethylation is the predominant change to H3K27me3 at this time point, reinforcing high expression of target genes. Additionally, inhibition of the H3K27 demethylase JMJD3 in naive CD4 T cells demonstrates how critically important molecules required for T cell differentiation, such as JAK2 and IL12RB2, are regulated by H3K27me3. Our results show that H3K27me3 is a dynamic and important epigenetic modification during CD4 T cell activation and that JMJD3-driven H3K27 demethylation is critical for CD4 T cell function.},
  author = {LaMere, Sarah A. and Thompson, Ryan C. and Meng, Xiangzhi and Komori, H. Kiyomi and Mark, Adam and Salomon, Daniel R.},
  date = {2017-11-01},
  doi = {10/gchc9x},
  eprint = {28947543},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/LaMere et al. - 2017 - H3K27 Methylation Dynamics during CD4 T Cell Activ.pdf},
  issn = {0022-1767},
  journaltitle = {The Journal of Immunology},
  keywords = {development,epigenetics,histone demethylation,kdm 4},
  number = {9},
  pages = {3158-3175},
  title = {{{H3K27}} Methylation Dynamics during {{CD4 T}} Cell Activation: Regulation of {{JAK}}/{{STAT}} and {{IL12RB2}} Expression by {{JMJD3}}},
  volume = {199}
}

@article{Landau2013,
  abstract = {A central goal of RNA sequencing (RNA-seq) experiments is to detect differentially expressed genes. In the ubiquitous negative binomial model for RNA-seq data, each gene is given a dispersion parameter, and correctly estimating these dispersion parameters is vital to detecting differential expression. Since the dispersions control the variances of the gene counts, underestimation may lead to false discovery, while overestimation may lower the rate of true detection. After briefly reviewing several popular dispersion estimation methods, this article describes a simulation study that compares them in terms of point estimation and the effect on the performance of tests for differential expression. The methods that maximize the test performance are the ones that use a moderate degree of dispersion shrinkage: the DSS, Tagwise wqCML, and Tagwise APL. In practical RNA-seq data analysis, we recommend using one of these moderate-shrinkage methods with the QLShrink test in QuasiSeq R package.},
  author = {Landau, William Michael and Liu, Peng},
  date = {2013},
  doi = {10/ggcxkt},
  eprint = {24349066},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Landau and Liu - 2013 - Dispersion estimation and its effect on test perfo.pdf},
  issn = {1932-6203},
  journaltitle = {PLoS ONE},
  keywords = {Gene Expression Profiling,Sequence Analysis; RNA,Sequence Analysis; RNA: methods},
  number = {12},
  pages = {e81415},
  title = {Dispersion Estimation and Its Effect on Test Performance in {{RNA}}-Seq Data Analysis: A Simulation-Based Comparison of Methods},
  volume = {8}
}

@article{Langaas2005,
  abstract = {We consider the problem of estimating the proportion of true null hypotheses, {$\pi_0$}, in a multiple-hypothesis set-up. The tests are based on observed p-values. We first review published estimators based on the estimator that was suggested by Schweder and Spj\o{}tvoll. Then we derive new estimators based on nonparametric maximum likelihood estimation of the p-value density, restricting to decreasing and convex decreasing densities. The estimators of {$\pi_0$} are all derived under the assumption of independent test statistics. Their performance under dependence is investigated in a simulation study. We find that the estimators are relatively robust with respect to the assumption of independence and work well also for test statistics with moderate dependence.},
  author = {Langaas, Mette and Lindqvist, Bo Henry and Ferkingstad, Egil},
  date = {2005},
  doi = {10/bnrssw},
  file = {/Users/ryan/Documents/Zotero Library/Langaas et al. - 2005 - Estimating the proportion of true null hypotheses,.pdf},
  issn = {1369-7412},
  journaltitle = {Journal of the Royal Statistical Society. Series B: Statistical Methodology},
  keywords = {Bioinformatics,Decreasing and convex density,Dependent test statistics,Multiple testing,Nonparametric maximum likelihood estimator,p-value},
  number = {4},
  pages = {555-572},
  title = {Estimating the Proportion of True Null Hypotheses, with Application to {{DNA}} Microarray Data},
  volume = {67}
}

@article{Langenberger2009a,
  abstract = {MicroRNA-offset-RNAs (moRNAs) were recently detected as highly abundant class of small RNAs in a basal chordate. Using short read sequencing data, we show here that moRNAs are also produced from human microRNA precursors, albeit at quite low expression levels. The expression levels of moRNAs are unrelated to those of the associated microRNAs. Surprisingly, microRNA precursors that also show moRNAs are typically evolutionarily old, comprising more than half of the microRNA families that were present in early Bilateria, while evidence for moRNAs was found only for a relative small fraction of microRNA families of recent origin.},
  author = {Langenberger, David and Bermudez-Santana, Clara and Hertel, Jana and Hoffmann, Steve and Khaitovich, Philipp and Stadler, Peter F.},
  date = {2009-09-15},
  doi = {10/frf9mz},
  eprint = {19584066},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Langenberger et al. - 2009 - Evidence for human microRNA-offset RNAs in small R.pdf},
  issn = {1367-4803},
  journaltitle = {Bioinformatics},
  keywords = {Humans,MicroRNAs,MicroRNAs: chemistry,RNA,RNA: chemistry,Sequence Analysis,Small Interfering,Small Interfering: chemistry},
  number = {18},
  pages = {2298-2301},
  title = {Evidence for Human {{microRNA}}-Offset {{RNAs}} in Small {{RNA}} Sequencing Data},
  volume = {25}
}

@article{Langmead2012,
  abstract = {As the rate of sequencing increases, greater throughput is demanded from read aligners. The full-text minute index is often used to make alignment very fast and memory-efficient, but the approach is ill-suited to finding longer, gapped alignments. Bowtie 2 combines the strengths of the full-text minute index with the flexibility and speed of hardware-accelerated dynamic programming algorithms to achieve a combination of high speed, sensitivity and accuracy.},
  author = {Langmead, Ben and Salzberg, Steven L.},
  date = {2012-04},
  doi = {10/gd2xzn},
  eprint = {22388286},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Langmead and Salzberg - 2012 - Fast gapped-read alignment with Bowtie 2..pdf},
  issn = {1548-7105},
  journaltitle = {Nature Methods},
  keywords = {Algorithms,Computational Biology,Computational Biology: methods,Databases; Genetic,Genome; Human,Genome; Human: genetics,Humans,Sequence Alignment,Sequence Alignment: methods,Sequence Analysis; DNA,Sequence Analysis; DNA: methods},
  number = {4},
  pages = {357-359},
  title = {Fast Gapped-Read Alignment with {{Bowtie}} 2},
  volume = {9}
}

@article{Langsrud2005,
  author = {Langsrud, \O{}yvind},
  date = {2005},
  doi = {10/b986bm},
  file = {/Users/ryan/Documents/Zotero Library/Langsrud - 2005 - Rotation tests.pdf},
  journaltitle = {Statistics and Computing},
  keywords = {adjusted p-value,conditional inference,microarray data analysis,multiple endpoints,multiple testing,random orthogonal matrix,spherical distribution},
  number = {1},
  pages = {53-60},
  title = {Rotation Tests},
  volume = {15}
}

@article{Lau2006,
  abstract = {Small noncoding RNAs regulate processes essential for cell growth and development, including mRNA degradation, translational repression, and transcriptional gene silencing (TGS). During a search for candidate mammalian factors for TGS, we purified a complex that contains small RNAs and Riwi, the rat homolog to human Piwi. The RNAs, frequently 29 to 30 nucleotides in length, are called Piwi-interacting RNAs (piRNAs), 94\% of which map to 100 defined ({$\leq$}101 kb) genomic regions. Within these regions, the piRNAs generally distribute across only one genomic strand or distribute on two strands but in a divergent, nonoverlapping manner. Preparations of piRNA complex (piRC) contain rRecQ1, which is homologous to qde-3 from Neurospora, a gene implicated in silencing pathways. Piwi has been genetically linked to TGS in flies, and slicer activity cofractionates with the purified complex. These results are consistent with a gene-silencing role for piRC in mammals.},
  author = {Lau, Nelson C. and Seto, Anita G. and Kim, Jinkuk and Kuramochi-Miyagawa, Satomi and Nakano, Toru and Bartel, David P. and Kingston, Robert E.},
  date = {2006-07-21},
  doi = {10/c3rxbv},
  eprint = {16778019},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Lau et al. - 2006 - Characterization of the piRNA complex from rat tes.pdf},
  issn = {1095-9203},
  journaltitle = {Science},
  keywords = {Adenosine Triphosphatases,Adenosine Triphosphatases: isolation & purification,Adenosine Triphosphatases: metabolism,Animals,Chromosomes; Mammalian,Conserved Sequence,DNA Helicases,DNA Helicases: isolation & purification,DNA Helicases: metabolism,Gene Library,Genome,Male,Mice,Proteins,Proteins: isolation & purification,Proteins: metabolism,Rats,Rats; Sprague-Dawley,RecQ Helicases,Ribonucleoproteins,Ribonucleoproteins: chemistry,Ribonucleoproteins: isolation & purification,Ribonucleoproteins: metabolism,RNA Interference,RNA; Untranslated,RNA; Untranslated: chemistry,RNA; Untranslated: genetics,RNA; Untranslated: isolation & purification,RNA; Untranslated: metabolism,Testis,Testis: chemistry,Transcription; Genetic},
  number = {5785},
  pages = {363-367},
  title = {Characterization of the {{piRNA}} Complex from Rat Testes},
  volume = {313}
}

@article{Lau2018,
  abstract = {Clonal expansion and immunological memory are hallmark features of the mammalian adaptive immune response and essential for prolonged host control of pathogens. Recent work demonstrates that natural killer (NK) cells of the innate immune system also exhibit these adaptive traits during infection. Here we demonstrate that differentiating and `memory' NK cells possess distinct chromatin accessibility states and that their epigenetic profiles reveal a `poised' regulatory program at the memory stage. Furthermore, we elucidate how individual STAT transcription factors differentially control epigenetic and transcriptional states early during infection. Finally, concurrent chromatin profiling of the canonical CD8+ T cell response against the same infection demonstrated parallel and distinct epigenetic signatures defining NK cells and CD8+ T cells. Overall, our study reveals the dynamic nature of epigenetic modifications during the generation of innate and adaptive lymphocyte memory.},
  author = {Lau, Colleen M. and Adams, Nicholas M. and Geary, Clair D. and Weizman, Orr-El and Rapp, Moritz and Pritykin, Yuri and Leslie, Christina S. and Sun, Joseph C.},
  date = {2018},
  doi = {10/gd46z7},
  file = {/Users/ryan/Documents/Zotero Library/Lau et al. - 2018 - Epigenetic control of innate and adaptive immune m.pdf},
  issn = {1529-2916},
  journaltitle = {Nature Immunology},
  number = {9},
  pages = {963-972},
  title = {Epigenetic Control of Innate and Adaptive Immune Memory},
  volume = {19}
}

@article{Lawrence2013,
  author = {Lawrence, Michael and Huber, Wolfgang and Pag\`es, Herv\'e and Aboyoun, Patrick and Carlson, Marc and Gentleman, Robert and Morgan, Martin T. and Carey, Vincent J.},
  date = {2013-08-08},
  doi = {10/f5cmfg},
  editor = {Prli\'c, Andreas},
  file = {/Users/ryan/Documents/Zotero Library/Lawrence et al. - 2013 - Software for Computing and Annotating Genomic Rang.pdf},
  issn = {1553-7358},
  journaltitle = {PLoS Computational Biology},
  number = {8},
  pages = {e1003118},
  title = {Software for Computing and Annotating Genomic Ranges},
  volume = {9}
}

@article{lawVoomPrecisionWeights2014,
  abstract = {New normal linear modeling strategies are presented for analyzing read counts from RNA-seq experiments. The voom method estimates the mean-variance relationship of the log-counts, generates a precision weight for each observation and enters these into the limma empirical Bayes analysis pipeline. This opens access for RNA-seq analysts to a large body of methodology developed for microarrays. Simulation studies show that voom performs as well or better than count-based RNA-seq methods even when the data are generated according to the assumptions of the earlier methods. Two case studies illustrate the use of linear modeling and gene set testing methods. \textcopyright{} 2014 Law et al.; licensee BioMed Central Ltd.},
  author = {Law, Charity W. and Chen, Yunshun and Shi, Wei and Smyth, Gordon K.},
  date = {2014},
  doi = {10/gfghcz},
  eprint = {24485249},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Law et al. - 2014 - voom precision weights unlock linear model analys.pdf},
  ids = {Law2014},
  issn = {1465-6906},
  journaltitle = {Genome Biology},
  number = {2},
  pages = {R29},
  title = {{{voom}}: Precision Weights Unlock Linear Model Analysis Tools for {{RNA}}-Seq Read Counts},
  volume = {15}
}

@article{LeBlanc2003,
  abstract = {Mesenchymal stem cells (MSC) derived from adult BM or fetal liver form several mesenchymal tissues after appropriate stimulation. Reports indicate that MSC have unique immunologic properties, making them ideal for cellular therapy. MSC are not immunogenic, they do not stimulate alloreactivity, and they escape lysis by cytotoxic T-cells and natural killer (NK)-cells. Thus, MSC may be transplantable between HLA-mismatched individuals without the need for host immunosuppression. Furthermore, adult MSC appear to be immunosuppressive as they reduce alloreactivity and the formation of cytotoxic lymphocytes in vitro. In vivo, adult MSC prolong the time to rejection of mis-matched skin grafts in baboons. The immunosuppressive properties of first trimester fetal MSC are less pronounced, but inducible with IFN-{$\gamma$}. These findings imply a potential role for MSC, not only in the repair of damaged tissues, but also in the manipulation of immune responses.},
  author = {Le Blanc, K.},
  date = {2003-12-01},
  doi = {10/dmxgm2},
  issn = {1465-3249},
  journaltitle = {Cytotherapy},
  keywords = {\#nosource},
  number = {6},
  pages = {485-489},
  title = {Immunomodulatory Effects of Fetal and Adult Mesenchymal Stem Cells},
  volume = {5}
}

@article{Leek2007,
  abstract = {It has unambiguously been shown that genetic, environmental, demographic, and technical factors may have substantial effects on gene expression levels. In addition to the measured variable(s) of interest, there will tend to be sources of signal due to factors that are unknown, unmeasured, or too complicated to capture through simple models. We show that failing to incorporate these sources of heterogeneity into an analysis can have widespread and detrimental effects on the study. Not only can this reduce power or induce unwanted dependence across genes, but it can also introduce sources of spurious signal to many genes. This phenomenon is true even for well-designed, randomized studies. We introduce ``surrogate variable analysis'' (SVA) to overcome the problems caused by heterogeneity in expression studies. SVA can be applied in conjunction with standard analysis techniques to accurately capture the relationship between expression and any modeled variables of interest. We apply SVA to disease class, time course, and genetics of gene expression studies. We show that SVA increases the biological accuracy and reproducibility of analyses in genome-wide expression studies.},
  author = {Leek, Jeffrey T. and Storey, John D.},
  date = {2007-09},
  doi = {10/c9pc69},
  eprint = {17907809},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Leek and Storey - 2007 - Capturing heterogeneity in gene expression studies.pdf},
  issn = {1553-7390},
  journaltitle = {PLoS Genetics},
  keywords = {Algorithms,Breast Neoplasms,Breast Neoplasms: genetics,Computer Simulation,Data Interpretation; Statistical,Epigenesis; Genetic,Female,Gene Expression,Genes; BRCA1,Genes; BRCA2,Genetic Heterogeneity,Genetic Linkage,Genome; Fungal,Genome; Human,Humans,Kidney,Kidney: metabolism,Linear Models,Mutation,Oligonucleotide Array Sequence Analysis,Quantitative Trait; Heritable,Reproducibility of Results,Saccharomyces cerevisiae,Saccharomyces cerevisiae: genetics,Saccharomyces cerevisiae: metabolism,Time Factors},
  number = {9},
  pages = {1724-1735},
  title = {Capturing Heterogeneity in Gene Expression Studies by Surrogate Variable Analysis},
  volume = {3}
}

@article{Leek2010,
  abstract = {High-throughput technologies are widely used, for example to assay genetic variants, gene and protein expression, and epigenetic modifications. One often overlooked complication with such studies is batch effects, which occur because measurements are affected by laboratory conditions, reagent lots and personnel differences. This becomes a major problem when batch effects are correlated with an outcome of interest and lead to incorrect conclusions. Using both published studies and our own analyses, we argue that batch effects (as well as other technical and biological artefacts) are widespread and critical to address. We review experimental and computational approaches for doing so.},
  author = {Leek, Jeffrey T. and Scharpf, Robert B. and Corrada Bravo, H\'ector and Simcha, David and Langmead, Benjamin and Johnson, W. Evan and Geman, Donald and Baggerly, Keith and Irizarry, Rafael A.},
  date = {2010},
  doi = {10/cfr324},
  eprint = {20838408},
  eprinttype = {pubmed},
  file = {/Users/ryan/Documents/Zotero Library/Leek et al. - 2010 - Tackling the widespread and critical impact of bat.pdf},
  issn = {1471-0056},
  journaltitle = {Nature Reviews Genetics},
  number = {10},
  pages = {733-739},
  title = {Tackling the Widespread and Critical Impact of Batch Effects in High-Throughput Data},
  volume = {11}
}

@article{Leek2014,
  abstract = {It is now known that unwanted noise and unmodeled artifacts such as batch effects can dramatically reduce the accuracy of statistical inference in genomic experiments. These sources of noise must be modeled and removed to accurately measure biological variability and to obtain correct statistical inference when performing high-throughput genomic analysis. We introduced surrogate variable analysis (sva) for estimating these artifacts by (i) identifying the part of the genomic data only affected by artifacts and (ii) estimating the artifacts with principal components or singular vectors of the subset of the data matrix. The resulting estimates of artifacts can be used in subsequent analyses as adjustment factors to correct analyses. Here I describe a version of the sva approach specifically created for count data or FPKMs from sequencing experiments based on appropriate data transformation. I also describe the addition of supervised sva (ssva) for using control probes to identify the part of the genomic data only affected by artifacts. I present a comparison between these versions of sva and other methods for batch effect estimation on simulated data, real count-based data and FPKM-based data. These updates are available through the sva Bioconductor package and I have made fully reproducible analysis using these methods available from: https://github.com/jtleek/svaseq.},
  author = {Leek, Jeffrey T.},
  date = {2014-12-01},
  doi = {10/f8k8kf},
  eprint = {25294822},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Leek - 2014 - svaseq removing batch effects and other unwanted .pdf},
  issn = {1362-4962},
  journaltitle = {Nucleic Acids Research},
  number = {21},
  pages = {e161},
  title = {{{svaseq}}: Removing Batch Effects and Other Unwanted Noise from Sequencing Data},
  volume = {42}
}

@article{Legault2013,
  abstract = {MOTIVATION: Alternative splicing and other processes that allow for different transcripts to be derived from the same gene are significant forces in the eukaryotic cell. RNA-Seq is a promising technology for analyzing alternative transcripts, as it does not require prior knowledge of transcript structures or genome sequences. However, analysis of RNA-Seq data in the presence of genes with large numbers of alternative transcripts is currently challenging due to efficiency, identifiability and representation issues.

RESULTS: We present RNA-Seq models and associated inference algorithms based on the concept of probabilistic splice graphs, which alleviate these issues. We prove that our models are often identifiable and demonstrate that our inference methods for quantification and differential processing detection are efficient and accurate.

AVAILABILITY: Software implementing our methods is available at http://deweylab.biostat.wisc.edu/psginfer.

CONTACT: cdewey@biostat.wisc.edu SUPPLEMENTARY INFORMATION: Supplementary data are available at Bioinformatics online.},
  author = {Legault, Laura H. and Dewey, Colin N.},
  date = {2013-08-02},
  doi = {10/f48wb2},
  eprint = {23846746},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Legault and Dewey - 2013 - Inference of alternative splicing from RNA-Seq dat.pdf},
  issn = {1367-4811},
  journaltitle = {Bioinformatics},
  number = {18},
  pages = {2300-2310},
  title = {Inference of Alternative Splicing from {{RNA}}-{{Seq}} Data with Probabilistic Splice Graphs},
  volume = {29}
}

@article{Leinonen2011,
  abstract = {The combination of significantly lower cost and increased speed of sequencing has resulted in an explosive growth of data submitted into the primary next-generation sequence data archive, the Sequence Read Archive (SRA). The preservation of experimental data is an important part of the scientific record, and increasing numbers of journals and funding agencies require that next-generation sequence data are deposited into the SRA. The SRA was established as a public repository for the next-generation sequence data and is operated by the International Nucleotide Sequence Database Collaboration (INSDC). INSDC partners include the National Center for Biotechnology Information (NCBI), the European Bioinformatics Institute (EBI) and the DNA Data Bank of Japan (DDBJ). The SRA is accessible at http://www.ncbi.nlm.nih.gov/Traces/sra from NCBI, at http://www.ebi.ac.uk/ena from EBI and at http://trace.ddbj.nig.ac.jp from DDBJ. In this article, we present the content and structure of the SRA, detail our support for sequencing platforms and provide recommended data submission levels and formats. We also briefly outline our response to the challenge of data growth. \textcopyright{} The Author(s) 2010.},
  author = {Leinonen, Rasko and Sugawara, Hideaki and Shumway, Martin},
  date = {2011},
  doi = {10/c652z5},
  file = {/Users/ryan/Documents/Zotero Library/Leinonen et al. - 2011 - The sequence read archive.pdf},
  issn = {0305-1048},
  issue = {SUPPL. 1},
  journaltitle = {Nucleic Acids Research},
  pages = {D19-D21},
  title = {The Sequence Read Archive},
  volume = {39}
}

@article{Leleu2010,
  abstract = {Chromatin-immunoprecipitation and sequencing (ChIP-seq) is a rapidly maturing technology that draws on the power of high-throughput short-read sequencing to decipher chromatin states with unprecedented precision and breadth. Although some aspects of the experimental protocol require careful tuning, the bottleneck currently firmly lies with the downstream data analysis. We give an overview of the better-established aspects of genome mapping and data normalization and we describe the more recent progress in peak calling and their statistical analysis and provide a brief overview of popular follow-up analyses such as genomic feature categorization and motif search. \textcopyright{} The Author 2010. Published by Oxford University Press. All rights reserved.},
  author = {Leleu, Marion and Lefebvre, Gr\'egory and Rougemont, Jacques},
  date = {2010-12-01},
  doi = {10/bh6hm6},
  eprint = {20861161},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Leleu et al. - 2010 - Processing and analyzing ChIP-seq data from short.pdf},
  issn = {2041-2649},
  journaltitle = {Briefings in Functional Genomics},
  keywords = {Base Sequence,Bioinformatics,Chip-seq,Chromatin,Chromatin Immunoprecipitation,Chromatin Immunoprecipitation: methods,Chromatin: metabolism,Chromosome Mapping,DNA,DNA binding,DNA: methods,Genetic,Genome,High-Throughput Nucleotide Sequencing,High-Throughput Nucleotide Sequencing: methods,High-throughput sequencing,Humans,Saccharomyces cerevisiae,Saccharomyces cerevisiae: genetics,Sequence Analysis,Transcription,Transcriptional regulation},
  number = {5-6},
  pages = {466-476},
  title = {Processing and Analyzing {{ChIP}}-Seq Data: From Short Reads to Regulatory Interactions},
  volume = {9}
}

@report{Leng2013,
  abstract = {Messenger RNA expression is important in normal development and differentiation, as well as in manifestation of disease. High-throughput cDNA sequencing (RNA-seq) experiments allow for the identification of differentially expressed (DE) genes and their corresponding isoforms on a genome-wide scale. However, statistical methods are required to ensure that accurate identifications are made. A number of methods have been developed for identifying DE genes in an RNA-seq experiment, but they are deficient for identifying DE isoforms. Because uncertainty in estimated isoform expression varies directly with isoform complexity, applications of gene-centric approaches to isoform inference results in reduced power for some classes of isoforms and increased false discoveries for others. In addition, the most popular gene-centric DE methods are not robust to outliers, which further increases the potential for false discoveries in both gene and isoform-level inference. We have developed an empirical Bayesian modeling approach for identifying differential expression in an RNA-seq experiment (EBSeq) comparing two or more biological conditions. Evaluation via simulation and case studies demonstrates that EBSeq is a powerful and robust approach that outperforms existing methods. Application of EBSeq to a study of human embryonic and induced pluripotent stem cells provides novel insights into genomic differences underlying these cell types and illustrates the importance of appropriate statistical analyses.},
  author = {Leng, Ning and Dawson, J. A. and Thomson, James and Ruotti, Victor and Rissman, Anna and Smits, Bart and Haag, Jill and Gould, Michael and Stewart, Ron and Kendziorski, Christina},
  date = {2012-03},
  file = {/Users/ryan/Documents/Zotero Library/Leng et al. - 2012 - EBSeq An empirical Bayes hierarchical model for i.pdf},
  institution = {{University of Wisconsin}},
  keywords = {⛔ No DOI found},
  number = {226},
  title = {{{EBSeq}}: {{An}} Empirical {{Bayes}} Hierarchical Model for Inference in {{RNA}}-Seq Experiments},
  type = {techreport},
  url = {http://bioinformatics.oxfordjournals.org/content/early/2013/02/21/bioinformatics.btt087.short},
  urldate = {2013-06-20}
}

@article{Lewin2007,
  author = {Lewin, Alex and Bochkina, Natalia and Richardson, Sylvia},
  date = {2007-01-21},
  doi = {10/fkh58c},
  file = {/Users/ryan/Documents/Zotero Library/Lewin et al. - 2007 - Fully Bayesian Mixture Model for Differential Gene.pdf},
  issn = {1544-6115},
  journaltitle = {Statistical Applications in Genetics and Molecular Biology},
  number = {1},
  title = {Fully {{Bayesian}} Mixture Model for Differential Gene Expression: Simulations and Model Checks},
  volume = {6}
}

@article{Li2001,
  abstract = {BACKGROUND A model-based analysis of oligonucleotide expression arrays we developed previously uses a probe-sensitivity index to capture the response characteristic of a specific probe pair and calculates model-based expression indexes (MBEI). MBEI has standard error attached to it as a measure of accuracy. Here we investigate the stability of the probe-sensitivity index across different tissue types, the reproducibility of results in replicate experiments, and the use of MBEI in perfect match (PM)-only arrays. RESULTS Probe-sensitivity indexes are stable across tissue types. The target gene's presence in many arrays of an array set allows the probe-sensitivity index to be estimated accurately. We extended the model to obtain expression values for PM-only arrays, and found that the 20-probe PM-only model is comparable to the 10-probe PM/MM difference model, in terms of the expression correlations with the original 20-probe PM/MM difference model. MBEI method is able to extend the reliable detection limit of expression to a lower mRNA concentration. The standard errors of MBEI can be used to construct confidence intervals of fold changes, and the lower confidence bound of fold change is a better ranking statistic for filtering genes. We can assign reliability indexes for genes in a specific cluster of interest in hierarchical clustering by resampling clustering trees. A software dChip implementing many of these analysis methods is made available. CONCLUSIONS The model-based approach reduces the variability of low expression estimates, and provides a natural method of calculating expression values for PM-only arrays. The standard errors attached to expression values can be used to assess the reliability of downstream analysis.},
  author = {Li, Cheng and Wong, Wing Hung},
  date = {2001-01},
  doi = {10/dtsm65},
  eprint = {11532216},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Li and Hung Wong - 2001 - Model-based analysis of oligonucleotide arrays mo.pdf},
  issn = {1474-760X},
  journaltitle = {Genome biology},
  keywords = {Cluster Analysis,DNA Probes,Gene Expression Profiling,Gene Expression Profiling: methods,Gene Expression Profiling: standards,Humans,Models; Statistical,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods,Oligonucleotide Array Sequence Analysis: standards,Organ Specificity,Reproducibility of Results,Research Design,RNA; Messenger,RNA; Messenger: analysis,RNA; Messenger: genetics,Sensitivity and Specificity,Software},
  number = {8},
  pages = {RESEARCH0032},
  title = {Model-Based Analysis of Oligonucleotide Arrays: Model Validation, Design Issues and Standard Error Application},
  volume = {2}
}

@article{Li2010,
  abstract = {MOTIVATION: RNA-Seq is a promising new technology for accurately measuring gene expression levels. Expression estimation with RNA-Seq requires the mapping of relatively short sequencing reads to a reference genome or transcript set. Because reads are generally shorter than transcripts from which they are derived, a single read may map to multiple genes and isoforms, complicating expression analyses. Previous computational methods either discard reads that map to multiple locations or allocate them to genes heuristically. RESULTS: We present a generative statistical model and associated inference methods that handle read mapping uncertainty in a principled manner. Through simulations parameterized by real RNA-Seq data, we show that our method is more accurate than previous methods. Our improved accuracy is the result of handling read mapping uncertainty with a statistical model and the estimation of gene expression levels as the sum of isoform expression levels. Unlike previous methods, our method is capable of modeling non-uniform read distributions. Simulations with our method indicate that a read length of 20-25 bases is optimal for gene-level expression estimation from mouse and maize RNA-Seq data when sequencing throughput is fixed.},
  author = {Li, Bo and Ruotti, Victor and Stewart, Ron M and Thomson, James A and Dewey, Colin N},
  date = {2010-02-15},
  doi = {10/c6ztk6},
  eprint = {20022975},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Li et al. - 2010 - RNA-Seq gene expression estimation with read mappi.pdf},
  issn = {1367-4811},
  journaltitle = {Bioinformatics},
  keywords = {Algorithms,Animals,Base Sequence,Computational Biology,Computational Biology: methods,Databases; Genetic,Gene Expression,Gene Expression Profiling,Genome,Mice,Sequence Analysis; RNA,Sequence Analysis; RNA: methods,Software,Zea mays,Zea mays: genetics},
  number = {4},
  pages = {493-500},
  title = {{{RNA}}-{{Seq}} Gene Expression Estimation with Read Mapping Uncertainty},
  volume = {26}
}

@article{Li2011,
  abstract = {Reproducibility is essential to reliable scientific discovery in high-throughput experiments. In this work we propose a unified approach to measure the reproducibility of findings identified from replicate experiments and identify putative discoveries using reproducibility. Unlike the usual scalar measures of reproducibility, our approach creates a curve, which quantitatively assesses when the findings are no longer consistent across replicates. Our curve is fitted by a copula mixture model, from which we derive a quantitative reproducibility score, which we call the "irreproducible discovery rate" (IDR) analogous to the FDR. This score can be computed at each set of paired replicate ranks and permits the principled setting of thresholds both for assessing reproducibility and combining replicates. Since our approach permits an arbitrary scale for each replicate, it provides useful descriptive measures in a wide variety of situations to be explored. We study the performance of the algorithm using simulations and give a heuristic analysis of its theoretical properties. We demonstrate the effectiveness of our method in a ChIP-seq experiment.},
  author = {Li, Qunhua and Brown, James B. and Huang, Haiyan and Bickel, Peter J.},
  date = {2011-09},
  doi = {10/bwxxjt},
  file = {/Users/ryan/Documents/Zotero Library/Li et al. - 2011 - Measuring reproducibility of high-throughput exper.pdf},
  issn = {1932-6157},
  journaltitle = {The Annals of Applied Statistics},
  keywords = {Association,Copula,Genomics,High-throughput experiment,Irreproducible discovery rate,Iterative algorithm,Mixture model,Reproducibility},
  number = {3},
  pages = {1752-1779},
  title = {Measuring Reproducibility of High-Throughput Experiments},
  volume = {5}
}

@article{Li2012,
  abstract = {We discuss the identification of genes that are associated with an outcome in RNA sequencing and other sequence-based comparative genomic experiments. RNA-sequencing data take the form of counts, so models based on the Gaussian distribution are unsuitable. Moreover, normalization is challenging because different sequencing experiments may generate quite different total numbers of reads. To overcome these difficulties, we use a log-linear model with a new approach to normalization. We derive a novel procedure to estimate the false discovery rate (FDR). Our method can be applied to data with quantitative, two-class, or multiple-class outcomes, and the computation is fast even for large data sets. We study the accuracy of our approaches for significance calculation and FDR estimation, and we demonstrate that our method has potential advantages over existing methods that are based on a Poisson or negative binomial model. In summary, this work provides a pipeline for the significance analysis of sequencing data.},
  author = {Li, Jun and Witten, Daniela M and Johnstone, Iain M and Tibshirani, Robert},
  date = {2012-07},
  doi = {10/b5jhk2},
  eprint = {22003245},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Li et al. - 2012 - Normalization, testing, and false discovery rate e.pdf},
  issn = {1468-4357},
  journaltitle = {Biostatistics},
  keywords = {Data Interpretation; Statistical,Humans,Models; Statistical,Reverse Transcriptase Polymerase Chain Reaction,RNA; Messenger,RNA; Messenger: chemistry,RNA; Messenger: genetics,Sequence Analysis; DNA,Sequence Analysis; DNA: methods},
  number = {3},
  pages = {523-538},
  title = {Normalization, Testing, and False Discovery Rate Estimation for {{RNA}}-Sequencing Data},
  volume = {13}
}

@article{Li2012a,
  abstract = {MOTIVATION: RNA-Seq uses the high-throughput sequencing technology to identify and quantify transcriptome at an unprecedented high resolution and low cost. However, RNA-Seq reads are usually not uniformly distributed and biases in RNA-Seq data post great challenges in many applications including transcriptome assembly and the expression level estimation of genes or isoforms. Much effort has been made in the literature to calibrate the expression level estimation from biased RNA-Seq data, but the effect of biases on transcriptome assembly remains largely unexplored. RESULTS: Here, we propose a statistical framework for both transcriptome assembly and isoform expression level estimation from biased RNA-Seq data. Using a quasi-multinomial distribution model, our method is able to capture various types of RNA-Seq biases, including positional, sequencing and mappability biases. Our experimental results on simulated and real RNA-Seq datasets exhibit interesting effects of RNA-Seq biases on both transcriptome assembly and isoform expression level estimation. The advantage of our method is clearly shown in the experimental analysis by its high sensitivity and precision in transcriptome assembly and the high concordance of its estimated expression levels with qRT-PCR data. AVAILABILITY: CEM is freely available at http://www.cs.ucr.edu/\textasciitilde{}liw/cem.html CONTACT: liw@cs.ucr.edu.},
  author = {Li, Wei and Jiang, Tao},
  date = {2012-10-11},
  doi = {10/f4fr87},
  eprint = {23060617},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Li and Jiang - 2012 - Transcriptome Assembly and Isoform Expression Leve.pdf},
  issn = {1367-4811},
  journaltitle = {Bioinformatics},
  number = {22},
  pages = {2914-2921},
  title = {Transcriptome Assembly and Isoform Expression Level Estimation from Biased {{RNA}}-{{Seq}} Reads},
  volume = {28}
}

@article{Li2018,
  abstract = {Background: Since the invention of next-generation RNA sequencing (RNA-seq) technologies, they have become a powerful tool to study the presence and quantity of RNA molecules in biological samples and have revolutionized transcriptomic studies. The analysis of RNA-seq data at four different levels (samples, genes, transcripts, and exons) involve multiple statistical and computational questions, some of which remain challenging up to date. Results: We review RNA-seq analysis tools at the sample, gene, transcript, and exon levels from a statistical perspective. We also highlight the biological and statistical questions of most practical considerations. Conclusions: The development of statistical and computational methods for analyzing RNA-seq data has made significant advances in the past decade. However, methods developed to answer the same biological question often rely on diverse statistical models and exhibit different performance under different scenarios. This review discusses and compares multiple commonly used statistical models regarding their assumptions, in the hope of helping users select appropriate methods as needed, as well as assisting developers for future method development.},
  author = {Li, Wei Vivian and Li, Jingyi Jessica},
  date = {2018},
  doi = {10/ggcxkv},
  eprint = {1804.06050},
  eprinttype = {arxiv},
  file = {/Users/ryan/Documents/Zotero Library/Li and Li - 2018 - Modeling and analysis of RNA-seq data a review fr.pdf},
  issn = {2095-4697},
  journaltitle = {Quantitative Biology},
  keywords = {alternatively spliced exons,differentially expressed genes,isoform reconstruction and quantification,RNA-seq,statistical modeling},
  number = {3},
  pages = {195-209},
  title = {Modeling and Analysis of {{RNA}}-Seq Data: A Review from a Statistical Perspective},
  volume = {6}
}

@unpublished{Liang2008,
  author = {Liang, Kun},
  date = {2008},
  file = {/Users/ryan/Documents/Zotero Library/Liang - 2008 - Further Details of DBChIP.pdf},
  number = {1},
  pages = {1-10},
  title = {Further Details of {{DBChIP}}}
}

@article{Liang2012,
  abstract = {BACKGROUND: ChIP-seq has become an important tool for identifying genome-wide protein-DNA interactions, including transcription factor binding and histone modifications. In ChIP-seq experiments, ChIP samples are usually coupled with their matching control samples. Proper normalization between the ChIP and control samples is an essential aspect of ChIP-seq data analysis. RESULTS: We have developed a data-adaptive method for estimating the normalization factor between the ChIP and the control samples. Our method, named as NCIS (Normalization of ChIP-Seq) can accommodate both low and high sequencing depth datasets. We compare statistical properties of NCIS against existing methods in a set of diverse simulation settings, where NCIS enjoys the best estimation precision. In addition, we illustrate the impact of the normalization factor in FDR control and show that NCIS leads to more power among methods that control FDR at nominal levels. CONCLUSION: Our results indicate that the proper normalization between the ChIP and control samples is an important step in ChIP-seq analysis in terms of power and error rate control. Our proposed method shows excellent statistical properties and is useful in the full range of ChIP-seq applications, especially with deeply sequenced data.},
  author = {Liang, Kun and Kele{\c{s}}, S{\"u}nd{\"u}z},
  date = {2012-08-10},
  doi = {10/gb8t7q},
  eprint = {22883957},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Liang and Keles - 2012 - Normalization of ChIP-seq data with control..pdf},
  issn = {1471-2105},
  journaltitle = {BMC bioinformatics},
  number = {1},
  pages = {199},
  title = {Normalization of {{ChIP}}-Seq Data with Control},
  volume = {13}
}

@article{Liang2012a,
  abstract = {Increasing number of ChIP-seq experiments are investigating transcription factor binding under multiple experimental conditions, for example, various treatment conditions, several distinct time points and different treatment dosage levels. Hence, identifying differential binding sites across multiple conditions is of practical importance in biological and medical research. To this end, we have developed a powerful and flexible program, called DBChIP, to detect differentially bound sharp binding sites across multiple conditions, with or without matching control samples. By assigning uncertainty measure to the putative differential binding sites, DBChIP facilitates downstream analysis. DBChIP is implemented in R programming language and can work with a wide range of sequencing file formats.},
  author = {Liang, Kun and Kele{\c{s}}, S{\"u}nd{\"u}z},
  date = {2012-01-01},
  doi = {10/d64vrb},
  eprint = {22057161},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Liang and Keles - 2012 - Detecting differential binding of transcription fa.pdf},
  issn = {1367-4811},
  journaltitle = {Bioinformatics},
  number = {1},
  pages = {121-122},
  title = {Detecting Differential Binding of Transcription Factors with {{ChIP}}-Seq},
  volume = {28}
}

@article{Liao2013a,
  abstract = {Read alignment is an ongoing challenge for the analysis of data from sequencing technologies. This article proposes an elegantly simple multi-seed strategy, called seed-and-vote, for mapping reads to a reference genome. The new strategy chooses the mapped genomic location for the read directly from the seeds. It uses a relatively large number of short seeds (called subreads) extracted from each read and allows all the seeds to vote on the optimal location. When the read length is {$<$}160 bp, overlapping subreads are used. More conventional alignment algorithms are then used to fill in detailed mismatch and indel information between the subreads that make up the winning voting block. The strategy is fast because the overall genomic location has already been chosen before the detailed alignment is done. It is sensitive because no individual subread is required to map exactly, nor are individual subreads constrained to map close by other subreads. It is accurate because the final location must be supported by several different subreads. The strategy extends easily to find exon junctions, by locating reads that contain sets of subreads mapping to different exons of the same gene. It scales up efficiently for longer reads.},
  author = {Liao, Yang and Smyth, Gordon K. and Shi, Wei},
  date = {2013-05-01},
  doi = {10/gfvj8g},
  eprint = {23558742},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Liao et al. - 2013 - The Subread aligner fast, accurate and scalable r.pdf},
  issn = {1362-4962},
  journaltitle = {Nucleic Acids Research},
  keywords = {Exons,Genomics,High-Throughput Nucleotide Sequencing,INDEL Mutation,Sequence Alignment,Sequence Alignment: methods,Software},
  number = {10},
  pages = {e108},
  title = {The {{Subread}} Aligner: Fast, Accurate and Scalable Read Mapping by Seed-and-Vote},
  volume = {41}
}

@article{Liao2014,
  abstract = {Motivation: Next-generation sequencing technologies generate millions of short sequence reads, which are usually aligned to a reference genome. In many applications, the key information required for downstream analysis is the number of reads mapping to each genomic feature, for example to each exon or each gene. The process of counting reads is called read summarization. Read summarization is required for a great variety of genomic analyses but has so far received relatively little attention in the literature. Results: We present featureCounts, a read summarization program suitable for counting reads generated from either RNA or genomic DNA sequencing experiments. featureCounts implements highly efficient chromosome hashing and feature blocking techniques. It is considerably faster than existing methods (by an order of magnitude for gene-level summarization) and requires far less computer memory. It works with either single or paired-end reads and provides a wide range of options appropriate for different sequencing applications. Availability and implementation: featureCounts is available under GNU General Public License as part of the Subread (http://subread.sourceforge.net) or Rsubread (http://www.bioconductor.org) software packages. Contact: shi@wehi.edu.au},
  author = {Liao, Yang and Smyth, Gordon K. and Shi, Wei},
  date = {2014},
  doi = {10/f5w7rp},
  eprint = {24227677},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Liao et al. - 2014 - FeatureCounts An efficient general purpose progra.pdf},
  issn = {1460-2059},
  journaltitle = {Bioinformatics},
  number = {7},
  pages = {923-930},
  title = {{{featureCounts}}: {{An}} Efficient General Purpose Program for Assigning Sequence Reads to Genomic Features},
  volume = {30}
}

@article{Liu2013,
  author = {Liu, Yun and Aryee, Martin J and Padyukov, Leonid and Fallin, M Daniele and Hesselberg, Espen and Runarsson, Arni and Reinius, Lovisa and Acevedo, Nathalie and Taub, Margaret and Ronninger, Marcus and Shchetynsky, Klementy and Scheynius, Annika and Kere, Juha and Alfredsson, Lars and Klareskog, Lars and Ekstr{\"o}m, Tomas J and Feinberg, Andrew P},
  date = {2013-01-20},
  doi = {10/f24vb9},
  file = {/Users/ryan/Documents/Zotero Library/Liu et al. - 2013 - Epigenome-wide association data implicate DNA meth.pdf},
  issn = {1087-0156},
  journaltitle = {Nature Biotechnology},
  number = {2},
  pages = {142-147},
  title = {Epigenome-Wide Association Data Implicate {{DNA}} Methylation as an Intermediary of Genetic Risk in Rheumatoid Arthritis},
  volume = {31}
}

@article{Liu2015,
  abstract = {Variations in sample quality are frequently encountered in small RNA-sequencing experiments, and pose a major challenge in a differential expression analysis. Removal of high variation samples reduces noise, but at a cost of reducing power, thus limiting our ability to detect biologically meaningful changes. Similarly, retaining these samples in the analysis may not reveal any statistically significant changes due to the higher noise level. A compromise is to use all available data, but to down-weight the observations from more variable samples. We describe a statistical approach that facilitates this by modelling heterogeneity at both the sample and observational levels as part of the differential expression analysis. At the sample level this is achieved by fitting a log-linear variance model that includes common sample-specific or group-specific parameters that are shared between genes. The estimated sample variance factors are then converted to weights and combined with observational level weights obtained from the mean-variance relationship of the log-counts-per-million using 'voom'. A comprehensive analysis involving both simulations and experimental RNA-sequencing data demonstrates that this strategy leads to a universally more powerful analysis and fewer false discoveries when compared to conventional approaches. This methodology has wide application and is implemented in the open-source 'limma' package.},
  author = {Liu, Ruijie and Holik, Aliaksei Z. and Su, Shian and Jansz, Natasha and Chen, Kelan and Leong, Huei San and Blewitt, Marnie E. and Asselin-Labat, Marie Liesse and Smyth, Gordon K. and Ritchie, Matthew E.},
  date = {2015-09-03},
  doi = {10/f7rq7c},
  file = {/Users/ryan/Documents/Zotero Library/Liu et al. - 2015 - Why weight Modelling sample and observational lev.pdf},
  issn = {1362-4962},
  journaltitle = {Nucleic Acids Research},
  number = {15},
  pages = {e97},
  title = {Why Weight? {{Modelling}} Sample and Observational Level Variability Improves Power in {{RNA}}-Seq Analyses},
  volume = {43}
}

@article{Lo2011,
  abstract = {Humans are diploid, carrying two copies of each chromosome, one from each parent. Separating the paternal and maternal chromosomes is an important component of genetic analyses such as determining genetic association, inferring evolutionary scenarios, computing recombination rates, and detecting cis-regulatory events. As the pair of chromosomes are mostly identical to each other, linking together of alleles at heterozygous sites is sufficient to phase, or separate the two chromosomes. In Haplotype Assembly, the linking is done by sequenced fragments that overlap two heterozygous sites. While there has been a lot of research on correcting errors to achieve accurate haplotypes via assembly, relatively little work has been done on designing sequencing experiments to get long haplotypes. Here, we describe the different design parameters that can be adjusted with next generation and upcoming sequencing technologies, and study the impact of design choice on the length of the haplotype.},
  author = {Lo, Christine and Bashir, Ali and Bansal, Vikas and Bafna, Vineet},
  date = {2011-01},
  doi = {10/ft8hdg},
  eprint = {21342554},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Lo et al. - 2011 - Strobe sequence design for haplotype assembly..pdf},
  issn = {1471-2105},
  issue = {Suppl 1},
  journaltitle = {BMC bioinformatics},
  keywords = {Algorithms,Computational Biology,Computational Biology: methods,Genome; Human,Genomics,Genomics: methods,Haplotypes,Humans,Polymorphism; Single Nucleotide,Sequence Analysis; DNA,Sequence Analysis; DNA: methods},
  pages = {S24},
  title = {Strobe Sequence Design for Haplotype Assembly},
  volume = {12}
}

@article{London2000,
  abstract = {To examine the functional characteristics of memory CD4+ T cells, we used an adoptive transfer system to generate a stable population of Ag-specific memory cells in vivo and compared their responses to Ag with those of a similar population of Ag-specific naive cells. Memory cells localized to the spleen and lymph nodes of mice and exhibited extremely rapid recall responses to Ag in vivo, leaving the spleen within 3-5 days of Ag encounter. Unlike their naive counterparts, memory cells produced effector cytokines (IFN-gamma, IL-4, IL-5) within 12-24 h of Ag exposure and did not require multiple cycles of cell division to do so. Memory cells proliferated at lower Ag concentrations than did naive cells, were less dependent on costimulation by B7 molecules, and independent of costimulation by CD40. Furthermore, effector cytokine production by memory cells also occurred in the absence of either B7 or CD40 costimulation. Lastly, memory cells were resistant to tolerance induction. Together, these findings suggest that the threshold for activation of memory CD4+ cells is lower than that of naive cells. This would permit memory cells to rapidly express their effector functions in vivo earlier in the course of a secondary immune response, when the levels of Ag and the availability of costimulation may be relatively low.},
  author = {London, Cheryl A. and Lodge, Michael P. and Abbas, Abul K.},
  date = {2000},
  doi = {10/ggcxkw},
  eprint = {10605020},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/London et al. - 2000 - Functional Responses and Costimulator Dependence o.pdf},
  issn = {0022-1767},
  journaltitle = {The Journal of Immunology},
  number = {1},
  pages = {265-272},
  title = {Functional Responses and Costimulator Dependence of Memory {{CD4}}+ {{T}} Cells},
  volume = {164}
}

@report{Love2014,
  author = {Love, Michael I and Huber, Wolfgang and Anders, Simon},
  date = {2014-02-19},
  doi = {10/rw5},
  file = {/Users/ryan/Documents/Zotero Library/Love et al. - 2014 - Moderated estimation of fold change and dispersion.pdf},
  institution = {{bioRxiv}},
  title = {Moderated Estimation of Fold Change and Dispersion for {{RNA}}-{{Seq}} Data with {{DESeq2}}},
  type = {Preprint},
  url = {http://biorxiv.org/lookup/doi/10.1101/002832},
  urldate = {2014-07-09}
}

@article{Love2015,
  abstract = {RNA-seq technology is widely used in biomedical and basic science research. These studies rely on complex computational methods that quantify expression levels for observed transcripts. We find that current computational methods can lead to hundreds of false positive results related to alternative isoform usage. This flaw in the current methodology stems from a lack of modeling sample-specific bias that leads to drops in coverage and is related to sequence features like fragment GC content and GC stretches. By incorporating features that explain this bias into transcript expression models, we greatly increase the specificity of transcript expression estimates, with more than a four-fold reduction in the number of false positives for reported changes in expression. We introduce alpine, a method for estimation of bias-corrected transcript abundance. The method is available as a Bioconductor package that includes data visualization tools useful for bias discovery.},
  author = {Love, Michael I and Hogenesch, John B and Irizarry, Rafael A},
  date = {2015},
  doi = {10/ggcxkx},
  eprint = {27669167},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Love et al. - 2015 - Modeling of RNA-seq fragment sequence bias reduces.pdf},
  journaltitle = {bioRxiv},
  pages = {025767},
  title = {Modeling of {{RNA}}-Seq Fragment Sequence Bias Reduces Systematic Errors in Transcript Abundance Estimation}
}

@article{Lu2005,
  abstract = {In testing for differential gene expression involving multiple serial analysis of gene expression (SAGE) libraries, it is critical to account for both between and within library variation. Several methods have been proposed, including the t test, tw test, and an overdispersed logistic regression approach. The merits of these tests, however, have not been fully evaluated. Questions still remain on whether further improvements can be made.},
  author = {Lu, Jun and Tomfohr, John K and Kepler, Thomas B},
  date = {2005-01},
  doi = {10/bkh3st},
  eprint = {15987513},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Lu et al. - 2005 - Identifying differential expression in multiple SA.pdf},
  issn = {1471-2105},
  journaltitle = {BMC bioinformatics},
  keywords = {Carcinoma; Pancreatic Ductal,Carcinoma; Pancreatic Ductal: genetics,Cell Line; Tumor,Gene Expression Profiling,Gene Expression Profiling: methods,Gene Expression Profiling: statistics \& numerical data,Gene Library,Humans,Internet,Linear Models,Pancreatic Neoplasms,Pancreatic Neoplasms: genetics,RNA; Messenger,RNA; Messenger: analysis,ROC Curve,User-Computer Interface},
  pages = {165},
  title = {Identifying Differential Expression in Multiple {{SAGE}} Libraries: An Overdispersed Log-Linear Model Approach},
  volume = {6}
}

@article{Luco2010,
  abstract = {Alternative splicing of pre-mRNA is a prominent mechanism to generate protein diversity, yet its regulation is poorly understood. We demonstrated a direct role for histone modifications in alternative splicing. We found distinctive histone modification signatures that correlate with the splicing outcome in a set of human genes, and modulation of histone modifications causes splice site switching. Histone marks affect splicing outcome by influencing the recruitment of splicing regulators via a chromatin-binding protein. These results outline an adaptor system for the reading of histone marks by the pre-mRNA splicing machinery.},
  author = {Luco, Reini F and Pan, Qun and Tominaga, Kaoru and Blencowe, Benjamin J and Pereira-Smith, Olivia M and Misteli, Tom},
  date = {2010-02-19},
  doi = {10/bqv9fv},
  eprint = {20133523},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Luco et al. - 2010 - Regulation of Alternative Splicing by Histone Modi.pdf},
  issn = {0036-8075},
  journaltitle = {Science},
  keywords = {Alternative Splicing,Cell Line,Chromatin,Chromatin: metabolism,Epithelial Cells,Epithelial Cells: metabolism,Exons,Histone-Lysine N-Methyltransferase,Histone-Lysine N-Methyltransferase: genetics,Histone-Lysine N-Methyltransferase: metabolism,Histones,Histones: metabolism,Humans,Male,Mesenchymal Stem Cells,Mesenchymal Stem Cells: metabolism,Polypyrimidine Tract-Binding Protein,Polypyrimidine Tract-Binding Protein: metabolism,Prostate,Prostate: cytology,Protein Binding,Receptor; Fibroblast Growth Factor; Type 2,Receptor; Fibroblast Growth Factor; Type 2: genetics,RNA Precursors,RNA Precursors: metabolism,Transcription Factors,Transcription Factors: genetics,Transcription Factors: metabolism},
  number = {5968},
  pages = {996-1000},
  title = {Regulation of Alternative Splicing by Histone Modifications},
  volume = {327}
}

@article{Lun2014,
  abstract = {A common aim in ChIP-seq experiments is to identify changes in protein binding patterns between conditions, i.e. differential binding. A number of peak- and window-based strategies have been developed to detect differential binding when the regions of interest are not known in advance. However, careful consideration of error control is needed when applying these methods. Peak-based approaches use the same data set to define peaks and to detect differential binding. Done improperly, this can result in loss of type I error control. For window-based methods, controlling the false discovery rate over all detected windows does not guarantee control across all detected regions. Misinterpreting the former as the latter can result in unexpected liberalness. Here, several solutions are presented to maintain error control for these de novo counting strategies. For peak-based methods, peak calling should be performed on pooled libraries prior to the statistical analysis. For window-based methods, a hybrid approach using Simes' method is proposed to maintain control of the false discovery rate across regions. More generally, the relative advantages of peak- and window-based strategies are explored using a range of simulated and real data sets. Implementations of both strategies also compare favourably to existing programs for differential binding analyses.},
  author = {Lun, Aaron T. L. and Smyth, Gordon K.},
  date = {2014-06-17},
  doi = {10/f5874p},
  eprint = {24852250},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Lun and Smyth - 2014 - De novo detection of differentially bound regions .pdf},
  issn = {0305-1048},
  journaltitle = {Nucleic Acids Research},
  keywords = {★},
  number = {11},
  pages = {e95},
  title = {De Novo Detection of Differentially Bound Regions for {{ChIP}}-Seq Data Using Peaks and Windows: Controlling Error Rates Correctly},
  volume = {42}
}

@article{Lun2015,
  author = {Lun, Aaron T. L. and Chen, Yunshun and Smyth, Gordon K.},
  date = {2016},
  file = {/Users/ryan/Documents/Zotero Library/Lun and Chen - 2015 - It ’ s DE-licious  a recipe for differential expr.pdf},
  journaltitle = {Methods in Molecular Biology},
  pages = {391-416},
  title = {It's {{DE}}-Licious: A Recipe for Differential Expression Analyses of {{RNA}}-Seq Experiments Using Quasi-Likelihood Methods in {{edgeR}}},
  volume = {1418}
}

@article{Lun2015a,
  abstract = {Chromatin immunoprecipitation with massively parallel sequencing (ChIP-seq) is widely used to identify binding sites for a target protein in the genome. An important scientific application is to identify changes in protein binding between different treatment conditions, i.e. to detect differential binding. This can reveal potential mechanisms through which changes in binding may contribute to the treatment effect. The csaw package provides a framework for the de novo detection of differentially bound genomic regions. It uses a window-based strategy to summarize read counts across the genome. It exploits existing statistical software to test for significant differences in each window. Finally, it clusters windows into regions for output and controls the false discovery rate properly over all detected regions. The csaw package can handle arbitrarily complex experimental designs involving biological replicates. It can be applied to both transcription factor and histone mark datasets, and, more generally, to any type of sequencing data measuring genomic coverage. csaw performs favorably against existing methods for de novo DB analyses on both simulated and real data. csaw is implemented as a R software package and is freely available from the open-source Bioconductor project.},
  author = {Lun, Aaron T. L. and Smyth, Gordon K.},
  date = {2016-03-18},
  doi = {10/f8g6nw},
  file = {/Users/ryan/Documents/Zotero Library/Lun and Smyth - 2015 - Csaw A Bioconductor package for differential bind.pdf},
  issn = {1362-4962},
  journaltitle = {Nucleic Acids Research},
  number = {5},
  pages = {e45},
  title = {{{csaw}}: A {{Bioconductor}} Package for Differential Binding Analysis of {{ChIP}}-Seq Data Using Sliding Windows},
  volume = {44}
}

@article{Lund2012,
  abstract = {Next generation sequencing technology provides a powerful tool for measuring gene expression (mRNA) levels in the form of RNA-sequence data. Method development for identifying differentially expressed (DE) genes from RNA-seq data, which frequently includes many low-count integers and can exhibit severe overdispersion relative to Poisson or binomial distributions, is a popular area of ongoing research. Here we present quasi-likelihood methods with shrunken dispersion estimates based on an adaptation of Smyth's (2004) approach to estimating gene-specific error variances for microarray data. Our suggested methods are computationally simple, analogous to ANOVA and compare favorably versus competing methods in detecting DE genes and estimating false discovery rates across a variety of simulations based on real data.},
  author = {Lund, Steven P and Nettleton, Dan and McCarthy, Davis J and Smyth, Gordon K},
  date = {2012-01},
  doi = {10/f95zdf},
  eprint = {23104842},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Lund et al. - 2012 - Detecting differential expression in RNA-sequence .pdf},
  issn = {1544-6115},
  journaltitle = {Statistical Applications in Genetics and Molecular Biology},
  keywords = {Base Sequence,Databases,Gene Expression Profiling,Gene Expression Profiling: methods,Gene Expression Profiling: statistics & numerical,Genetic,Likelihood Functions,Messenger,Messenger: metabolism,RNA,RNA: methods,Sequence Analysis},
  number = {5},
  title = {Detecting Differential Expression in {{RNA}}-Sequence Data Using Quasi-Likelihood with Shrunken Dispersion Estimates},
  volume = {11}
}

@article{Luo2013,
  author = {Luo, Weijun and Brouwer, Cory},
  date = {2013-06-04},
  doi = {10/f44j5k},
  file = {/Users/ryan/Documents/Zotero Library/Luo and Brouwer - 2013 - Pathview an RBioconductor package for pathway ba.pdf},
  issn = {1367-4803},
  journaltitle = {Bioinformatics},
  number = {14},
  pages = {1830-1831},
  title = {Pathview: An {{R}}/{{Bioconductor}} Package for Pathway Based Data Integration and Visualization},
  volume = {29}
}

@report{Maciuca2016,
  author = {Maciuca, Sorina and del Ojo Elias, Carlos and McVean, Gil and Iqbal, Zamin},
  date = {2016-06-15},
  doi = {10/dd8v},
  file = {/Users/ryan/Documents/Zotero Library/Maciuca et al. - 2016 - A natural encoding of genetic variation in a Burro.pdf},
  institution = {bioRxiv},
  keywords = {burrows-wheeler transform,fm index,genome,pan-genome},
  options = {useprefix=true},
  pages = {1-10},
  title = {A Natural Encoding of Genetic Variation in a {{Burrows}}-{{Wheeler Transform}} to Enable Mapping and Genome Inference},
  type = {Preprint},
  url = {http://biorxiv.org/lookup/doi/10.1101/059170}
}

@article{Madrigal2016,
  author = {Madrigal, Pedro},
  title = {{{fCCAC}}: Functional Canonical Correlation Analysis to Evaluate Covariance between Nucleic Acid Sequencing Datasets},
  journaltitle = {bioRxiv},
  date = {2016},
  pages = {060780},
  doi = {10/ggcxkz},
  abstract = {Computational evaluation of variability across DNA or RNA sequencing datasets is a crucial step in genomic science, as it allows both to evaluate the reproducibility across biological or technical replicates, and to compare different datasets to identify their potential correlations. Here I present fCCAC, an application of functional canonical correlation analysis to assess covariance of nucleic acid sequencing datasets such as chromatin immunoprecipitation followed by deep sequencing (ChIP-seq). I exemplify how this method can reveal shared covariance between histone modifications and DNA binding proteins, such as the relationship between the H3K4me3 chromatin mark and its epigenetic writers and readers. R code is publicly available at http://github.com/pmb59/fCCAC/.},
  file = {/Users/ryan/Documents/Zotero Library/Madrigal - 2016 - fCCAC functional canonical correlation analysis t.pdf},
}

@book{Maindonald2008,
  author = {Maindonald, J H},
  date = {2008-01-19},
  file = {/Users/ryan/Documents/Zotero Library/Maindonald - 2008 - Using R for Data Analysis and Graphics Introductio.pdf},
  keywords = {⛔ No DOI found},
  title = {Using {{R}} for Data Analysis and Graphics: Introduction, Code and Commentary}
}

@article{Majewski2010,
  author = {Majewski, Ian J and Ritchie, Matthew E and Phipson, Belinda and Corbin, Jason and Pakusch, Miha and Ebert, Anja and Busslinger, Meinrad and Koseki, Haruhiko and Hu, Yifang and Smyth, Gordon K and Alexander, Warren S and Hilton, Douglas J and Blewitt, Marnie E},
  date = {2010},
  doi = {10/dcvjmc},
  file = {/Users/ryan/Documents/Zotero Library/Majewski et al. - 2010 - stem and progenitor cells Opposing roles of polyco.pdf},
  isbn = {2009122607},
  journaltitle = {Blood},
  number = {5},
  pages = {731-739},
  title = {Opposing Roles of Polycomb Repressive Complexes in Hematopoietic Stem and Progenitor Cells},
  volume = {116}
}

@article{Majumdar2003,
  abstract = {We have characterized adhesion molecules on the surface of multipotential human mesenchymal stem cells (hMSCs) and identified molecules whose ligands are present on mature hematopoietic cells. Flow cytometric analysis of hMSCs identified the expression of integrins: {$\alpha$}1, {$\alpha$}2, {$\alpha$}3, {$\alpha$}5, {$\alpha$}6, {$\alpha$}v, {$\beta$}1, {$\beta$}3, and {$\beta$}4, in addition to ICAM-1, ICAM-2, VCAM-1, CD72, and LFA-3. Exposure of hMSCs to IL-1{$\alpha$}, TNF{$\alpha$} or IFN{$\gamma$} up-modulated ICAM-1 surface expression, whereas only IFN{$\gamma$} increased both HLA-class I and -class II molecules on the cell surface. Whole cell-binding assays between the hMSCs and hematopoietic cell lines showed that T lymphocytic lines bound hMSCs with higher affinity than lines of either B lymphocytes or those of myeloid lineage. Experiments using autologous T lymphocytes isolated from peripheral blood mononuclear cells showed that hMSCs exhibited increased affinity for activated T-lymphocytes compared to resting T cells by quantitative whole cell binding and rosetting assays. Flow cytometric analysis of rosetted cells demonstrated that both CD4+ and CD8+ cells bound to hMSCs. To determine the functional significance of these findings, we tested the ability of hMSCs to present antigen to T lymphocytes. hMSCs pulsed with tetanus toxoid stimulated proliferation and cytokine production (IL-4, IL-10, and IFN{$\gamma$}) in a tetanus-toxoid-specific T cell line. Maximal cytokine production correlated with maximal antigen-dependent proliferation. These data demonstrate physiological outcome as a consequence of interactions between hMSCs and human hematopoietic lineage cells, suggesting a role for hMSCs in vivo to influence both hematopoietic and immune function(s). Copyright \textcopyright{} 2003 National Science Council, ROC and S. Karger AG, Basel.},
  author = {Majumdar, Manas K. and Keane-Moore, Michele and Buyaner, Diana and Hardy, Wayne B. and Moorman, Mark A. and McIntosh, Kevin R. and Mosca, Joseph D.},
  date = {2003},
  doi = {10/b7sw4z},
  file = {/Users/ryan/Documents/Zotero Library/Majumdar et al. - 2003 - Characterization and functionality of cell surface.pdf},
  issn = {1021-7770},
  journaltitle = {Journal of Biomedical Science},
  keywords = {Antigen presentation,Hematopoietic interactions,Immune function,Mesenchymal stem cells,T lymphocytes},
  number = {2},
  pages = {228-241},
  title = {Characterization and Functionality of Cell Surface Molecules on Human Mesenchymal Stem Cells},
  volume = {10}
}

@article{Maksimovic2012,
  abstract = {DNA methylation is the most widely studied epigenetic mark and is known to be essential to normal development and frequently disrupted in disease. The Illumina HumanMethylation450 BeadChip assays the methylation status of CpGs at 485,577 sites across the genome. Here we present Subset-quantile Within Array Normalization (SWAN), a new method that substantially improves the results from this platform by reducing technical variation within and between arrays. SWAN is available in the minfi Bioconductor package.},
  author = {Maksimovic, Jovana and Gordon, Lavinia and Oshlack, Alicia},
  date = {2012},
  doi = {10/ggcxk2},
  file = {/Users/ryan/Documents/Zotero Library/Maksimovic et al. - 2012 - SWAN Subset-quantile within array normalization f.pdf},
  issn = {1465-6914},
  journaltitle = {Genome Biology},
  number = {6},
  pages = {R44},
  title = {{{SWAN}}: Subset-Quantile within Array Normalization for {{Illumina Infinium HumanMethylation450 BeadChips}}},
  volume = {13}
}

@article{Manno2017,
  abstract = {RNA abundance is a powerful indicator of the state of individual cells, but does not directly reveal dynamic processes such as cellular differentiation. Here we show that RNA velocity\textemdash{}the time derivative of RNA abundance\textemdash{}can be estimated by distinguishing unspliced and spliced mRNAs in standard single-cell RNA sequencing protocols. We show that RNA velocity is a vector that predicts the future state of individual cells on a timescale of hours. We validate the accuracy of RNA velocity in the neural crest lineage, demonstrate its use on multiple technical platforms, reconstruct the branching lineage tree of the mouse hippocampus, and measure RNA kinetics in human embryonic brain. We expect RNA velocity to greatly aid the analysis of developmental lineages and cellular dynamics, particularly in humans.},
  author = {La Manno, Gioele and Soldatov, Ruslan and Hochgerner, Hannah and Zeisel, Amit and Petukhov, Viktor and Kastriti, Maria and Lonnerberg, Peter and Furlan, Alessandro and Fan, Jean and Liu, Zehua and van Bruggen, David and Guo, Jimin and Sundstrom, Erik and Castelo-Branco, Goncalo and Adameyko, Igor and Linnarsson, Sten and Kharchenko, Peter},
  date = {2017},
  doi = {10/ggcxmq},
  file = {/Users/ryan/Documents/Zotero Library/Soldatov et al. - 2017 - RNA velocity in single cells.pdf},
  journaltitle = {bioRxiv},
  options = {useprefix=true},
  pages = {206052},
  title = {{{RNA}} Velocity in Single Cells}
}

@article{Marin-Bejar2013,
  abstract = {BACKGROUND The p53 transcription factor is located at the core of a complex wiring of signaling pathways that are critical for the preservation of cellular homeostasis. Only recently it has become clear that p53 regulates the expression of several long intergenic noncoding RNAs (lincRNAs). However, relatively little is known about the role that lincRNAs play in this pathway. RESULTS Here we characterize a lincRNA named Pint (p53 induced noncoding transcript). We show that Pint is a ubiquitously expressed lincRNA that is finely regulated by p53. In mouse cells, Pint promotes cell proliferation and survival by regulating the expression of genes of the TGF-b, MAPK and p53 pathways. Pint is a nuclear lincRNA that directly interacts with the Polycomb repressive complex 2 (PRC2), and is required for PRC2 targeting of specific genes for H3K27 tri-methylation and repression. Furthermore, Pint functional activity is highly dependent on PRC2 expression. We have also identified Pint human ortholog (PINT), which presents suggestive analogies with the murine lincRNA. PINT is similarly regulated by p53, and its expression significantly correlates with the same cellular pathways as the mouse ortholog, including the p53 pathway. Interestingly, PINT is downregulated in colon primary tumors, while its overexpression inhibits the proliferation of tumor cells, suggesting a possible role as tumor suppressor. CONCLUSIONS Our results reveal a p53 autoregulatory negative mechanism where a lincRNA connects p53 activation with epigenetic silencing by PRC2. Additionally, we show analogies and differences between the murine and human orthologs, identifying a novel tumor suppressor candidate lincRNA.},
  author = {Mar{\'i}n-B{\'e}jar, Oskar and Marchese, Francesco P and Athie, Alejandro and S{\'a}nchez, Yolanda and Gonz{\'a}lez, Jovanna and Segura, Victor and Huang, Lulu and Moreno, Isabel and Navarro, Alfons and Monz{\'o}, Mariano and Garc{\'i}a-Foncillas, Jes{\'u}s and Rinn, John L and Guo, Shuling and Huarte, Maite},
  date = {2013},
  doi = {10/gbd9hk},
  eprint = {24070194},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Marín-Béjar et al. - 2013 - Pint lincRNA connects the p53 pathway with epigene.pdf},
  issn = {1474-760X},
  journaltitle = {Genome Biology},
  keywords = {gene regulation,lincrna,non-coding rna,p53,polycomb repressive complex 2},
  number = {9},
  pages = {R104},
  title = {Pint {{lincRNA}} Connects the {{p53}} Pathway with Epigenetic Silencing by the {{Polycomb}} Repressive Complex 2},
  volume = {14}
}

@article{Marioni2008,
  abstract = {Ultra-high-throughput sequencing is emerging as an attractive alternative to microarrays for genotyping, analysis of methylation patterns, and identification of transcription factor binding sites. Here, we describe an application of the Illumina sequencing (formerly Solexa sequencing) platform to study mRNA expression levels. Our goals were to estimate technical variance associated with Illumina sequencing in this context and to compare its ability to identify differentially expressed genes with existing array technologies. To do so, we estimated gene expression differences between liver and kidney RNA samples using multiple sequencing replicates, and compared the sequencing data to results obtained from Affymetrix arrays using the same RNA samples. We find that the Illumina sequencing data are highly replicable, with relatively little technical variation, and thus, for many purposes, it may suffice to sequence each mRNA sample only once (i.e., using one lane). The information in a single lane of Illumina sequencing data appears comparable to that in a single array in enabling identification of differentially expressed genes, while allowing for additional analyses such as detection of low-expressed genes, alternative splice variants, and novel transcripts. Based on our observations, we propose an empirical protocol and a statistical framework for the analysis of gene expression using ultra-high-throughput sequencing technology.},
  author = {Marioni, John C and Mason, Christopher E and Mane, Shrikant M and Stephens, Matthew and Gilad, Yoav},
  date = {2008-09},
  doi = {10/dd965f},
  eprint = {18550803},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Marioni et al. - 2008 - RNA-seq an assessment of technical reproducibilit.pdf},
  issn = {1088-9051},
  journaltitle = {Genome Research},
  keywords = {Gene Expression Profiling,Gene Expression Profiling: methods,Humans,Likelihood Functions,Male,Models; Biological,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods,Reproducibility of Results,RNA; Messenger,RNA; Messenger: chemistry,RNA; Messenger: metabolism,Sequence Analysis; RNA,Sequence Analysis; RNA: methods},
  number = {9},
  pages = {1509-1517},
  title = {{{RNA}}-Seq: An Assessment of Technical Reproducibility and Comparison with Gene Expression Arrays},
  volume = {18}
}

@article{Martin2012,
  abstract = {We propose a flexible and identifiable version of the two-groups model, motivated by hierarchical Bayes considerations, that features an empirical null and a semiparametric mixture model for the non-null cases. We use a computationally efficient predictive recursion marginal likelihood procedure to estimate the model parameters, even the nonparametric mixing distribution. This leads to a nonparametric empirical Bayes testing procedure, which we call PRtest, based on thresholding the estimated local false discovery rates. Simulations and real-data examples demonstrate that, compared to existing approaches, PRtest's careful handling of the non-null density can give a much better fit in the tails of the mixture distribution which, in turn, can lead to more realistic conclusions.},
  author = {Martin, Ryan and Tokdar, Surya T.},
  date = {2012},
  doi = {10/c6v3g4},
  eprint = {22085895},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Martin and Tokdar - 2012 - A nonparametric empirical Bayes framework for larg.pdf},
  issn = {1465-4644},
  journaltitle = {Biostatistics},
  keywords = {Dirichlet process,Marginal likelihood,Mixture model,Predictive recursion,Two-groups model},
  number = {3},
  pages = {427-439},
  title = {A Nonparametric Empirical {{Bayes}} Framework for Large-Scale Multiple Testing},
  volume = {13}
}

@article{Massa2010a,
  abstract = {Recently, a great effort in microarray data analysis is directed towards the study of the so-called gene sets. A gene set is defined by genes that are, somehow, functionally related. For example, genes appearing in a known biological pathway naturally define a gene set. The gene sets are usually identified from a priori biological knowledge. Nowadays, many bioinformatics resources store such kind of knowledge (see, for example, the Kyoto Encyclopedia of Genes and Genomes, among others). Although pathways maps carry important information about the structure of correlation among genes that should not be neglected, the currently available multivariate methods for gene set analysis do not fully exploit it.},
  author = {Massa, Maria Sofia and Chiogna, Monica and Romualdi, Chiara},
  date = {2010-09-01},
  doi = {10.1186/1752-0509-4-121},
  eprint = {20809931},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Massa et al. - 2010 - Gene set analysis exploiting the topology of a pat.pdf;/Users/ryan/Zotero/storage/X6IERVU9/1752-0509-4-121.html},
  issn = {1752-0509},
  journaltitle = {BMC Systems Biology},
  keywords = {⚠️ Invalid DOI,Animals,Computational Biology,Computational Biology: methods,Computer Graphics,Gene Expression Profiling,Humans,Mice,Models; Genetic,Receptor; Epidermal Growth Factor,Receptor; Epidermal Growth Factor: genetics,Receptor; Epidermal Growth Factor: metabolism,Receptors; Antigen; B-Cell,Receptors; Antigen; B-Cell: genetics,Receptors; Antigen; B-Cell: metabolism,Signal Transduction},
  number = {1},
  pages = {121},
  shortjournal = {BMC Systems Biology},
  title = {Gene Set Analysis Exploiting the Topology of a Pathway},
  volume = {4}
}

@article{Mastrokolias2012,
  abstract = {Transcriptome analysis is of great interest in clinical research, where significant differences between individuals can be translated into biomarkers of disease. Although next generation sequencing provides robust, comparable and highly informative expression profiling data, with several million of tags per blood sample, reticulocyte globin transcripts can constitute up to 76\% of total mRNA compromising the detection of low abundant transcripts. We have removed globin transcripts from 6 human whole blood RNA samples with a human globin reduction kit and compared them with the same non-reduced samples using deep Serial Analysis of Gene Expression.},
  author = {Mastrokolias, Anastasios and den Dunnen, Johan T and van Ommen, Gertjan B and 't Hoen, Peter A C and van Roon-Mom, Willeke M C},
  date = {2012-01},
  doi = {10/fx3s8g},
  eprint = {22257641},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Mastrokolias et al. - 2012 - Increased sensitivity of next generation sequencin.pdf},
  issn = {1471-2164},
  journaltitle = {BMC Genomics},
  keywords = {Adult,Aged,Female,Gene Expression Profiling,Globins,Globins: genetics,Globins: metabolism,Humans,Male,Middle Aged,RNA,RNA: blood,RNA: genetics,Sensitivity and Specificity,Sequence Analysis; RNA,Signal Transduction},
  number = {1},
  options = {useprefix=true},
  pages = {28},
  title = {Increased Sensitivity of next Generation Sequencing-Based Expression Profiling after Globin Reduction in Human Blood {{RNA}}},
  volume = {13}
}

@article{McCall2010,
  abstract = {Robust multiarray analysis (RMA) is the most widely used preprocessing algorithm for Affymetrix and Nimblegen gene expression microarrays. RMA performs background correction, normalization, and summarization in a modular way. The last 2 steps require multiple arrays to be analyzed simultaneously. The ability to borrow information across samples provides RMA various advantages. For example, the summarization step fits a parametric model that accounts for probe effects, assumed to be fixed across arrays, and improves outlier detection. Residuals, obtained from the fitted model, permit the creation of useful quality metrics. However, the dependence on multiple arrays has 2 drawbacks: (1) RMA cannot be used in clinical settings where samples must be processed individually or in small batches and (2) data sets preprocessed separately are not comparable. We propose a preprocessing algorithm, frozen RMA (fRMA), which allows one to analyze microarrays individually or in small batches and then combine the data for analysis. This is accomplished by utilizing information from the large publicly available microarray databases. In particular, estimates of probe-specific effects and variances are precomputed and frozen. Then, with new data sets, these are used in concert with information from the new arrays to normalize and summarize the data. We find that fRMA is comparable to RMA when the data are analyzed as a single batch and outperforms RMA when analyzing multiple batches. The methods described here are implemented in the R package fRMA and are currently available for download from the software section of http://rafalab.jhsph.edu.},
  author = {McCall, Matthew N. and Bolstad, Benjamin M. and Irizarry, Rafael A.},
  date = {2010-04-01},
  doi = {10/bxxhbc},
  eprint = {20097884},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Mccall et al. - 2010 - Frozen robust multiarray analysis (fRMA).pdf},
  issn = {1465-4644},
  journaltitle = {Biostatistics},
  keywords = {Affymetrix,Algorithms,ArrayExpress,Biometry,Biometry: methods,Databases,Genetic,GEO,Humans,Internet,interventions for carers,Microarray,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods,palliative care,Preprocessing,Single-array,Software,sytematic review},
  number = {2},
  pages = {242-253},
  title = {Frozen Robust Multiarray Analysis ({{fRMA}})},
  volume = {11}
}

@article{McCall2011,
  author = {McCall, Matthew N and Irizarry, Rafael A},
  date = {2011-12-16},
  doi = {10/fv34wn},
  file = {/Users/ryan/Documents/Zotero Library/McCall and Irizarry - 2011 - Thawing Frozen Robust Multi-array Analysis (fRMA).pdf},
  issn = {1471-2105},
  journaltitle = {BMC Bioinformatics},
  keywords = {interventions for carers,palliative care,sytematic review},
  number = {1},
  pages = {369},
  title = {Thawing Frozen Robust Multi-Array Analysis ({{fRMA}})},
  volume = {12}
}

@article{McCall2014,
  abstract = {MOTIVATION: Quantitative real-time PCR (qPCR) is one of the most widely used methods to measure gene expression. Despite extensive research in qPCR laboratory protocols, normalization and statistical analysis, little attention has been given to qPCR non-detects\textemdash{}those reactions failing to produce a minimum amount of signal.

RESULTS: We show that the common methods of handling qPCR non-detects lead to biased inference. Furthermore, we show that non-detects do not represent data missing completely at random and likely represent missing data occurring not at random. We propose a model of the missing data mechanism and develop a method to directly model non-detects as missing data. Finally, we show that our approach results in a sizeable reduction in bias when estimating both absolute and differential gene expression. Availability and implementation: The proposed algorithm is implemented in the R package, nondetects. This package also contains the raw data for the three example datasets used in this manuscript. The package is freely available at http://mnmccall.com/software and as part of the Bioconductor project.

CONTACT: mccallm@gmail.com.},
  author = {McCall, Matthew N. and McMurray, Helene R. and Land, Hartmut and Almudevar, Anthony},
  date = {2014-08-15},
  doi = {10/f6jv47},
  eprint = {24764462},
  eprinttype = {pmid},
  issn = {1367-4803},
  journaltitle = {Bioinformatics},
  keywords = {\#nosource},
  number = {16},
  pages = {2310-2316},
  title = {On Non-Detects in {{qPCR}} Data},
  volume = {30}
}

@article{Mccarthy2009,
  abstract = {MOTIVATION: Statistical methods are used to test for the differential expression of genes in microarray experiments. The most widely used methods successfully test whether the true differential expression is different from zero, but give no assurance that the differences found are large enough to be biologically meaningful. RESULTS: We present a method, t-tests relative to a threshold (TREAT), that allows researchers to test formally the hypothesis (with associated p-values) that the differential expression in a microarray experiment is greater than a given (biologically meaningful) threshold. We have evaluated the method using simulated data, a dataset from a quality control experiment for microarrays and data from a biological experiment investigating histone deacetylase inhibitors. When the magnitude of differential expression is taken into account, TREAT improves upon the false discovery rate of existing methods and identifies more biologically relevant genes. AVAILABILITY: R code implementing our methods is contributed to the software package limma available at http://www.bioconductor.org.},
  author = {McCarthy, Davis J. and Smyth, Gordon K.},
  date = {2009},
  doi = {10/bmnsw4},
  eprint = {19176553},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Mccarthy and Smyth - 2009 - Testing significance relative to a fold-change thr.pdf},
  issn = {1367-4803},
  journaltitle = {Bioinformatics},
  number = {6},
  pages = {765-771},
  title = {Testing Significance Relative to a Fold-Change Threshold Is a {{TREAT}}},
  volume = {25}
}

@article{McCarthy2012,
  abstract = {A flexible statistical framework is developed for the analysis of read counts from RNA-Seq gene expression studies. It provides the ability to analyse complex experiments involving multiple treatment conditions and blocking variables while still taking full account of biological variation. Biological variation between RNA samples is estimated separately from the technical variation associated with sequencing technologies. Novel empirical Bayes methods allow each gene to have its own specific variability, even when there are relatively few biological replicates from which to estimate such variability. The pipeline is implemented in the edgeR package of the Bioconductor project. A case study analysis of carcinoma data demonstrates the ability of generalized linear model methods (GLMs) to detect differential expression in a paired design, and even to detect tumour-specific expression changes. The case study demonstrates the need to allow for gene-specific variability, rather than assuming a common dispersion across genes or a fixed relationship between abundance and variability. Genewise dispersions de-prioritize genes with inconsistent results and allow the main analysis to focus on changes that are consistent between biological replicates. Parallel computational approaches are developed to make non-linear model fitting faster and more reliable, making the application of GLMs to genomic data more convenient and practical. Simulations demonstrate the ability of adjusted profile likelihood estimators to return accurate estimators of biological variability in complex situations. When variation is gene-specific, empirical Bayes estimators provide an advantageous compromise between the extremes of assuming common dispersion or separate genewise dispersion. The methods developed here can also be applied to count data arising from DNA-Seq applications, including ChIP-Seq for epigenetic marks and DNA methylation analyses.},
  author = {McCarthy, Davis J and Chen, Yunshun and Smyth, Gordon K},
  date = {2012-05},
  doi = {10/fxwbrf},
  eprint = {22287627},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/McCarthy et al. - 2012 - Differential expression analysis of multifactor RN.pdf},
  issn = {1362-4962},
  journaltitle = {Nucleic Acids Research},
  number = {10},
  pages = {4288-4297},
  title = {Differential Expression Analysis of Multifactor {{RNA}}-{{Seq}} Experiments with Respect to Biological Variation},
  volume = {40}
}

@article{McDonald2016,
  abstract = {The transcriptional repressor Bcl-6 is linked to the development of both CD4(+) T follicular helper (TFH) and central memory T (TCM) cells. Here, we demonstrate that in response to decreased IL-2 signalling, T helper 1 (TH1) cells upregulate Bcl-6 and co-initiate TFH- and TCM-like gene programs, including expression of the cytokine receptors IL-6R{$\alpha$} and IL-7R. Exposure of this potentially bi-potent cell population to IL-6 favours the TFH gene program, whereas IL-7 signalling represses TFH-associated genes including Bcl6 and Cxcr5, but not the TCM-related genes Klf2 and Sell. Mechanistically, IL-7-dependent activation of STAT5 contributes to Bcl-6 repression. Importantly, antigen-specific IL-6R{$\alpha$}(+)IL-7R(+) CD4(+) T cells emerge from the effector population at late time points post influenza infection. These data support a novel role for IL-7 in the repression of the TFH gene program and evoke a divergent regulatory mechanism by which post-effector TH1 cells may contribute to long-term cell-mediated and humoral immunity.},
  author = {McDonald, Paul W. and Read, Kaitlin A. and Baker, Chandra E. and Anderson, Ashlyn E. and Powell, Michael D. and Ballesteros-Tato, Andr\'e and Oestreich, Kenneth J.},
  date = {2016},
  doi = {10/ggcxk3},
  eprint = {26743592},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/McDonald et al. - 2016 - IL-7 signalling represses Bcl-6 and the TFH gene p.pdf},
  issn = {2041-1723},
  journaltitle = {Nature Communications},
  pages = {10285},
  title = {{{IL}}-7 Signalling Represses {{Bcl}}-6 and the {{TFH}} Gene Program},
  volume = {7}
}

@article{McGill1978,
  abstract = {Box plots display batches of data. Five values from a set of data are conventionally used; the extremes, the upper and lower hinges (quartiles), and the median. Such plots are becoming a widely used tool in exploratory data analysis and in preparing visual summaries for statisticians and nonstatisticians alike. Three variants of the basic display, devised by the authors, are described. The first visually incorporates a measure of group size; the second incorporates an indication of rough significance of differences between medians; the third combines the features of the first two. These techniques are displayed by examples.},
  author = {McGill, Robert and Tukey, John W and Larsen, Wayne A},
  date = {1978},
  doi = {10/dsvtxr},
  eprint = {2683468},
  eprinttype = {jstor},
  issn = {0003-1305},
  journaltitle = {The American Statistician},
  keywords = {\#nosource},
  number = {1},
  pages = {12-16},
  title = {Variations of {{Box Plots}}},
  volume = {32}
}

@article{McLean2010,
  abstract = {We developed the Genomic Regions Enrichment of Annotations Tool (GREAT) to analyze the functional significance of cis-regulatory regions identified by localized measurements of DNA binding events across an entire genome. Whereas previous methods took into account only binding proximal to genes, GREAT is able to properly incorporate distal binding sites and control for false positives using a binomial test over the input genomic regions. GREAT incorporates annotations from 20 ontologies and is available as a web application. Applying GREAT to data sets from chromatin immunoprecipitation coupled with massively parallel sequencing (ChIP-seq) of multiple transcription-associated factors, including SRF, NRSF, GABP, Stat3 and p300 in different developmental contexts, we recover many functions of these factors that are missed by existing gene-based tools, and we generate testable hypotheses. The utility of GREAT is not limited to ChIP-seq, as it could also be applied to open chromatin, localized epigenomic markers and similar functional data sets, as well as comparative genomics sets. \textcopyright{} 2010 Nature America, Inc. All rights reserved.},
  author = {McLean, Cory Y. and Bristor, Dave and Hiller, Michael and Clarke, Shoa L. and Schaar, Bruce T. and Lowe, Craig B. and Wenger, Aaron M. and Bejerano, Gill},
  date = {2010},
  doi = {10/cz6j9d},
  eprint = {20436461},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/McLean et al. - 2010 - GREAT improves functional interpretation of cis-re.pdf},
  issn = {1087-0156},
  journaltitle = {Nature Biotechnology},
  number = {5},
  pages = {495-501},
  title = {{{GREAT}} Improves Functional Interpretation of Cis-Regulatory Regions},
  volume = {28}
}

@article{Meyer2011,
  abstract = {Progression through mitosis requires the sequential ubiquitination of cell cycle regulators by the anaphase-promoting complex, resulting in their proteasomal degradation. Although several mechanisms contribute to APC/C regulation during mitosis, the APC/C is able to discriminate between its many substrates by exploiting differences in the processivity of ubiquitin chain assembly. Here, we discuss how the APC/C achieves processive ubiquitin chain formation to trigger the sequential degradation of cell cycle regulators during mitosis.},
  author = {Meyer, Hermann-Josef and Rape, Michael},
  date = {2011-08},
  doi = {10/bk9kr5},
  eprint = {21477659},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Meyer and Rape - 2011 - Processive ubiquitin chain formation by the anapha.pdf},
  issn = {1096-3634},
  journaltitle = {Seminars in Cell \& Developmental Biology},
  keywords = {Animals,Cell Nucleus,Cell Nucleus: genetics,Cell Nucleus: metabolism,Gene Expression Regulation; Developmental,Humans,Mammals,Mitosis,Proteasome Endopeptidase Complex,Proteasome Endopeptidase Complex: genetics,Proteasome Endopeptidase Complex: metabolism,Protein Binding,Protein Binding: genetics,Proteolysis,Substrate Specificity,Ubiquitin,Ubiquitin-Conjugating Enzymes,Ubiquitin-Conjugating Enzymes: genetics,Ubiquitin-Conjugating Enzymes: metabolism,Ubiquitin-Protein Ligase Complexes,Ubiquitin-Protein Ligase Complexes: genetics,Ubiquitin-Protein Ligase Complexes: metabolism,Ubiquitin: genetics,Ubiquitin: metabolism,Ubiquitination},
  number = {6},
  pages = {544-550},
  title = {Processive Ubiquitin Chain Formation by the Anaphase-Promoting Complex},
  volume = {22}
}

@article{Mir2010,
  abstract = {Kinases execute pivotal cellular functions and are therefore widely investigated as potential targets in anticancer treatment. Here we analyze the kinase gene expression profiles of various tumor types and reveal the wee1 kinase to be overexpressed in glioblastomas. We demonstrate that WEE1 is a major regulator of the G(2) checkpoint in glioblastoma cells. Inhibition of WEE1 by siRNA or small molecular compound in cells exposed to DNA damaging agents results in abrogation of the G(2) arrest, premature termination of DNA repair, and cell death. Importantly, we show that the small-molecule inhibitor of WEE1 sensitizes glioblastoma to ionizing radiation in vivo. Our results suggest that inhibition of WEE1 kinase holds potential as a therapeutic approach in treatment of glioblastoma.},
  author = {Mir, Shahryar E and De Witt Hamer, Philip C and Krawczyk, Przemek M and Balaj, Leonora and Claes, An and Niers, Johanna M and Van Tilborg, Angela A. G. and Zwinderman, Aeilko H and Geerts, Dirk and Kaspers, Gertjan J. L. and Vandertop, W. Peter and Cloos, Jacqueline and Tannous, Bakhos A and Wesseling, Pieter and Aten, Jacob A. and Noske, David P and Van Noorden, Cornelis J. F. and W\"urdinger, Thomas},
  date = {2010-09-14},
  doi = {10/fczt9r},
  eprint = {20832752},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Mir et al. - 2010 - In Silico Analysis of Kinase Expression Identifies.pdf},
  issn = {1535-6108},
  journaltitle = {Cancer Cell},
  keywords = {Amplified Fragment Length Polymorphism Analysis,Animal,Animals,Cell Cycle,Cell Cycle Proteins,Cell Cycle Proteins: antagonists & inhibitors,Cell Cycle Proteins: biosynthesis,Cell Cycle Proteins: genetics,Cell Cycle Proteins: physiology,Cell Cycle: drug effects,Cell Cycle: genetics,Disease Models,DNA Damage,DNA Repair,G2 Phase,G2 Phase: physiology,Gene Expression Profiling,Glioblastoma,Glioblastoma: drug therapy,Glioblastoma: enzymology,Glioblastoma: genetics,Glioblastoma: pathology,Humans,Mice,Microarray Analysis,Mitosis,Mitosis: physiology,Nuclear Proteins,Nuclear Proteins: antagonists & inhibitors,Nuclear Proteins: biosynthesis,Nuclear Proteins: genetics,Nuclear Proteins: physiology,Nude,Protein-Tyrosine Kinases,Protein-Tyrosine Kinases: antagonists & inhibitors,Protein-Tyrosine Kinases: biosynthesis,Protein-Tyrosine Kinases: genetics,Protein-Tyrosine Kinases: physiology,Pyrimidines,Pyrimidines: pharmacology,Tumor Suppressor Protein p53,Tumor Suppressor Protein p53: genetics,Tumor Suppressor Protein p53: metabolism},
  number = {3},
  pages = {244-257},
  title = {In {{Silico Analysis}} of {{Kinase Expression Identifies WEE1}} as a {{Gatekeeper}} against {{Mitotic Catastrophe}} in {{Glioblastoma}}},
  volume = {18}
}

@article{Mo2012,
  abstract = {Chromatin immunoprecipitation followed by next generation sequencing (ChIP-seq) is a powerful technique that is being used in a wide range of biological studies including genome-wide measurements of protein-DNA interactions, DNA methylation, and histone modifications. The vast amount of data and biases introduced by sequencing and/or genome mapping pose new challenges and call for effective methods and fast computer programs for statistical analysis. To systematically model ChIP-seq data, we build a dynamic signal profile for each chromosome and then model the profile using a fully Bayesian hidden Ising model. The proposed model naturally takes into account spatial dependency and global and local distributions of sequence tags. It can be used for one-sample and two-sample analyses. Through model diagnosis, the proposed method can detect falsely enriched regions caused by sequencing and/or mapping errors, which is usually not offered by the existing hypothesis-testing-based methods. The proposed method is illustrated using 3 transcription factor (TF) ChIP-seq data sets and 2 mixed ChIP-seq data sets and compared with 4 popular and/or well-documented methods: MACS, CisGenome, BayesPeak, and SISSRs. The results indicate that the proposed method achieves equivalent or higher sensitivity and spatial resolution in detecting TF binding sites with false discovery rate at a much lower level.},
  author = {Mo, Qianxing},
  date = {2012-01},
  doi = {10/bwnx7s},
  eprint = {21914728},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Mo - 2012 - A fully Bayesian hidden Ising model for ChIP-seq d.pdf},
  issn = {1468-4357},
  journaltitle = {Biostatistics},
  keywords = {chip-seq,ising model,markov random fields,massively parallel sequencing,next generation},
  number = {1},
  pages = {113-128},
  title = {A Fully {{Bayesian}} Hidden {{Ising}} Model for {{ChIP}}-Seq Data Analysis},
  volume = {13}
}

@article{Moradkhani2009,
  abstract = {The human alpha-globin genes are paralogues, sharing a high degree of DNA sequence similarity and producing an identical alpha-globin chain. Over half of the alpha-globin structural variants reported to date are only characterized at the amino acid level. It is likely that a fraction of these variants, with phenotypes differing from one observation to another, may be due to the same mutation but on a different alpha-globin gene. There have been very few previous examples of hemoglobin variants that can be found at both HBA1 and HBA2 genes. Here, we report the results of a systematic multicenter study in a large multiethnic population to identify such variants and to analyze their differences from a functional and evolutionary perspective. We identified 14 different Hb variants resulting from identical mutations on either one of the two human alpha-globin paralogue genes. We also showed that the average percentage of hemoglobin variants due to a HBA2 gene mutation (alpha2) is higher than the percentage of hemoglobin variants due to the same HBA1 gene mutation (alpha1) and that the alpha2/alpha1 ratio varied between variants. These alpha-globin chain variants have most likely occurred via recurrent mutations, gene conversion events, or both. Based on these data, we propose a nomenclature for hemoglobin variants that fall into this category.},
  author = {Moradkhani, Kamran and Pr\'ehu, Claude and Old, John and Henderson, Shirley and Balamitsa, Vera and Luo, Hong Yuan and Poon, Man Chiu and Chui, David H K and Wajcman, Henri and Patrinos, George P.},
  date = {2009},
  doi = {10/b5z3jq},
  eprint = {18923834},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Moradkhani et al. - 2009 - Mutations in the paralogous human α-globin genes y.pdf},
  issn = {0939-5555},
  journaltitle = {Annals of Hematology},
  keywords = {Gene conversion,Hemoglobin variants,Mutations,Paralogues,α-Globin genes},
  number = {6},
  pages = {535-543},
  title = {Mutations in the Paralogous Human {$\alpha$}-Globin Genes Yielding Identical Hemoglobin Variants},
  volume = {88}
}

@article{Moreno-Hagelsieb2008,
  abstract = {The analyses of the increasing number of genome sequences requires shortcuts for the detection of orthologs, such as Reciprocal Best Hits (RBH), where orthologs are assumed if two genes each in a different genome find each other as the best hit in the other genome. Two BLAST options seem to affect alignment scores the most, and thus the choice of a best hit: the filtering of low information sequence segments and the algorithm used to produce the final alignment. Thus, we decided to test whether such options would help better detect orthologs.},
  author = {Moreno-Hagelsieb, Gabriel and Latimer, Kristen},
  date = {2008-02-01},
  doi = {10/bc4ssc},
  eprint = {18042555},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Moreno-Hagelsieb and Latimer - 2008 - Choosing BLAST options for better detection of ort.pdf},
  issn = {1367-4811},
  journaltitle = {Bioinformatics},
  keywords = {Base Sequence,Chromosome Mapping,Chromosome Mapping: methods,Database Management Systems,Databases; Genetic,DNA; Bacterial,DNA; Bacterial: genetics,Escherichia coli,Escherichia coli: genetics,Molecular Sequence Data,Proteins,Proteins: genetics,Sequence Alignment,Sequence Alignment: methods,Sequence Analysis; DNA,Sequence Analysis; DNA: methods,Sequence Homology; Nucleic Acid},
  number = {3},
  pages = {319-324},
  title = {Choosing {{BLAST}} Options for Better Detection of Orthologs as Reciprocal Best Hits},
  volume = {24}
}

@article{Mostafavi2013,
  abstract = {Transcriptomic assays that measure expression levels are widely used to study the manifestation of environmental or genetic variations in cellular processes. RNA-sequencing in particular has the potential to considerably improve such understanding because of its capacity to assay the entire transcriptome, including novel transcriptional events. However, as with earlier expression assays, analysis of RNA-sequencing data requires carefully accounting for factors that may introduce systematic, confounding variability in the expression measurements, resulting in spurious correlations. Here, we consider the problem of modeling and removing the effects of known and hidden confounding factors from RNA-sequencing data. We describe a unified residual framework that encapsulates existing approaches, and using this framework, present a novel method, HCP (Hidden Covariates with Prior). HCP uses a more informed assumption about the confounding factors, and performs as well or better than existing approaches while having a much lower computational cost. Our experiments demonstrate that accounting for known and hidden factors with appropriate models improves the quality of RNA-sequencing data in two very different tasks: detecting genetic variations that are associated with nearby expression variations (cis-eQTLs), and constructing accurate co-expression networks.},
  author = {Mostafavi, Sara and Battle, Alexis and Zhu, Xiaowei and Urban, Alexander E. and Levinson, Douglas and Montgomery, Stephen B. and Koller, Daphne},
  date = {2013-01-18},
  doi = {10/ggcxk5},
  editor = {Benos, Panayiotis V.},
  eprint = {23874524},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Mostafavi et al. - 2013 - Normalizing RNA-Sequencing Data by Modeling Hidden.pdf},
  issn = {1932-6203},
  journaltitle = {PLoS ONE},
  number = {7},
  pages = {e68141},
  title = {Normalizing {{RNA}}-{{Sequencing Data}} by {{Modeling Hidden Covariates}} with {{Prior Knowledge}}},
  volume = {8}
}

@article{Mukherjee2003,
  abstract = {A statistical methodology for estimating dataset size requirements for classifying microarray data using learning curves is introduced. The goal is to use existing classification results to estimate dataset size requirements for future classification experiments and to evaluate the gain in accuracy and significance of classifiers built with additional data. The method is based on fitting inverse power-law models to construct empirical learning curves. It also includes a permutation test procedure to assess the statistical significance of classification performance for a given dataset size. This procedure is applied to several molecular classification problems representing a broad spectrum of levels of complexity.},
  author = {Mukherjee, Sayan and Tamayo, Pablo and Rogers, Simon and Rifkin, Ryan and Engle, Anna and Campbell, Colin and Golub, Todd R and Mesirov, Jill P},
  date = {2003-01},
  doi = {10/c88n8f},
  eprint = {12804087},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Mukherjee et al. - 2003 - Estimating dataset size requirements for classifyi.pdf},
  issn = {1066-5277},
  journaltitle = {Journal of Computational Biology},
  keywords = {Algorithms,Computational Biology,Computational Biology: methods,Computer Simulation,Gene Expression Profiling,Gene Expression Profiling: classification,Gene Expression Profiling: methods,Humans,Models; Molecular,Neoplasms,Neoplasms: classification,Neoplasms: genetics,Neoplasms: metabolism,Oligonucleotide Array Sequence Analysis},
  number = {2},
  pages = {119-142},
  title = {Estimating Dataset Size Requirements for Classifying {{DNA}} Microarray Data},
  volume = {10}
}

@book{Murphy2012,
  abstract = {This is a textbook that introduces the immune system in all its aspects to undergraduates, and also provides a treatment of the subject that is comprehensive enough to be useful to graduate students interested in research, and to medical students focused on clinical applications. The Eighth Edition has been thoroughly revised and updated.},
  author = {Murphy, Kenneth and Travers, Paul and Walport, Mark and Janeway, Charles},
  date = {2012},
  edition = {8},
  isbn = {978-0-8153-4243-4 978-0-8153-4530-5},
  keywords = {\#nosource},
  langid = {english},
  location = {{New York}},
  note = {OCLC: 733935898},
  publisher = {{Garland Science}},
  title = {Janeway's Immunobiology}
}

@article{Mutz2012,
  abstract = {Up to date research in biology, biotechnology, and medicine requires fast genome and transcriptome analysis technologies for the investigation of cellular state, physiology, and activity. Here, microarray technology and next generation sequencing of transcripts (RNA-Seq) are state of the art. Since microarray technology is limited towards the amount of RNA, the quantification of transcript levels and the sequence information, RNA-Seq provides nearly unlimited possibilities in modern bioanalysis. This chapter presents a detailed description of next-generation sequencing (NGS), describes the impact of this technology on transcriptome analysis and explains its possibilities to explore the modern RNA world.},
  author = {Mutz, KO and Heilkenbrinker, Alexandra and L\"onne, Maren},
  date = {2012-02},
  doi = {10/gfznt9},
  journaltitle = {Current Opinion in Biotechnology},
  keywords = {\#nosource},
  number = {1},
  pages = {22-30},
  title = {Transcriptome Analysis Using Next-Generation Sequencing},
  volume = {24}
}

@article{Nellore2015,
  abstract = {RNA sequencing (RNA-seq) experiments now span hundreds to thousands of samples. A source of frustration for investigators analyzing a given dataset is the inability to rapidly and reproducibly align its samples jointly. Current spliced alignment software is designed to analyze each sample separately. Consequently, no information is gained from analyzing multiple samples together, and it is difficult to reproduce the exact analysis without access to original computing resources. We describe Rail-RNA, a cloud-enabled spliced aligner that analyzes many samples at once. Rail-RNA eliminates redundant work across samples, making it more efficient as samples are added. For many samples, Rail-RNA is more accurate than annotation-assisted aligners. We use Rail-RNA to align 666 RNA-seq samples from the GEUVADIS project on Amazon Web Services in 12 hours for US\$0.69 per sample. Rail-RNA produces alignments and base-resolution bigWig coverage files, ready for use with downstream packages for reproducible statistical analysis. We identify 290,416 expressed regions in the GEUVADIS samples, including 21,224 that map to intergenic sequence. We show that these regions show consistent patterns of variation across populations and with respect to known technological confounders. We identify expressed regions in the GEUVADIS samples and show that both annotated and unannotated (novel) expressed regions exhibit consistent patterns of variation across populations and with respect to known confounders. Rail-RNA is open-source software available at http://rail.bio .},
  author = {Nellore, Abhinav and Collado-Torres, Leonardo and Jaffe, Andrew E and Morton, James and Pritt, Jacob and Alquicira-Hern\'andez, Jos\'e and Leek, Jeffrey T and Langmead, Ben},
  date = {2015},
  doi = {10/ggcxk6},
  file = {/Users/ryan/Documents/Zotero Library/Nellore et al. - 2015 - Rail-RNA Scalable analysis of RNA-seq splicing an.pdf},
  journaltitle = {bioRxiv},
  pages = {019067},
  title = {Rail-{{RNA}}: {{Scalable}} Analysis of {{RNA}}-Seq Splicing and Coverage}
}

@article{Nettleton2006,
  author = {Nettleton, Dan and Hwang, J. T. Gene and Caldo, Rico A. and Wise, Roger P.},
  date = {2006-09},
  doi = {10/c7s43v},
  file = {/Users/ryan/Documents/Zotero Library/Nettleton et al. - 2006 - Estimating the number of true null hypotheses from.pdf},
  issn = {1085-7117},
  journaltitle = {Journal of Agricultural, Biological, and Environmental Statistics},
  keywords = {false discovery rate,microarray data,multiple testing},
  number = {3},
  pages = {337-356},
  title = {Estimating the Number of True Null Hypotheses from a Histogram of p Values},
  volume = {11}
}

@article{NgocTamTran2014,
  author = {Tran, Ngoc Tam and Huang, Chun-Hsi},
  date = {2014},
  doi = {10/ggcxk7},
  file = {/Users/ryan/Documents/Zotero Library/Ngoc Tam Tran and Huang - 2014 - Gene Expression and Gene Ontology Enrichment Analy.pdf},
  issn = {1177-6250},
  journaltitle = {Gene Regulation and Systems Biology},
  keywords = {chip-seq,embryonic stem cell,gene expression,gene ontology,h3k4me1,h3k4me3,rna-seq},
  pages = {33-43},
  title = {Gene {{Expression}} and {{Gene Ontology Enrichment Analysis}} for {{H3K4me3}} and {{H3K4me1}} in {{Mouse Liver}} and {{Mouse Embryonic Stem Cell Using ChIP}}-{{Seq}} and {{RNA}}-{{Seq}}},
  volume = {8}
}

@article{Nix2008,
  author = {Nix, David A. and Courdy, Samir J and Boucher, Kenneth M},
  date = {2008},
  doi = {10/fgswg4},
  file = {/Users/ryan/Documents/Zotero Library/Nix et al. - 2008 - Empirical methods for controlling false positives .pdf},
  issn = {1471-2105},
  journaltitle = {BMC Bioinformatics},
  number = {1},
  pages = {523},
  title = {Empirical Methods for Controlling False Positives and Estimating Confidence in {{ChIP}}-{{Seq}} Peaks},
  volume = {9}
}

@article{Nookaew2012,
  abstract = {RNA-seq, has recently become an attractive method of choice in the studies of transcriptomes, promising several advantages compared with microarrays. In this study, we sought to assess the contribution of the different analytical steps involved in the analysis of RNA-seq data generated with the Illumina platform, and to perform a cross-platform comparison based on the results obtained through Affymetrix microarray. As a case study for our work we, used the Saccharomyces cerevisiae strain CEN.PK 113-7D, grown under two different conditions (batch and chemostat). Here, we asses the influence of genetic variation on the estimation of gene expression level using three different aligners for read-mapping (Gsnap, Stampy and TopHat) on S288c genome, the capabilities of five different statistical methods to detect differential gene expression (baySeq, Cuffdiff, DESeq, edgeR and NOISeq) and we explored the consistency between RNA-seq analysis using reference genome and de novo assembly approach. High reproducibility among biological replicates (correlation {$\geq$}0.99) and high consistency between the two platforms for analysis of gene expression levels (correlation {$\geq$}0.91) are reported. The results from differential gene expression identification derived from the different statistical methods, as well as their integrated analysis results based on gene ontology annotation are in good agreement. Overall, our study provides a useful and comprehensive comparison between the two platforms (RNA-seq and microrrays) for gene expression analysis and addresses the contribution of the different steps involved in the analysis of RNA-seq data.},
  author = {Nookaew, Intawat and Papini, Marta and Pornputtapong, Natapol and Scalcinati, Gionata and Fagerberg, Linn and Uhl\'en, Matthias and Nielsen, Jens},
  date = {2012-11-10},
  doi = {10/f2zqzt},
  eprint = {22965124},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Nookaew et al. - 2012 - A comprehensive comparison of RNA-Seq-based transc.pdf},
  issn = {1362-4962},
  journaltitle = {Nucleic Acids Research},
  number = {20},
  pages = {10084-10097},
  title = {A Comprehensive Comparison of {{RNA}}-{{Seq}}-Based Transcriptome Analysis from Reads to Differential Gene Expression and Cross-Comparison with Microarrays: A Case Study in {{Saccharomyces}} Cerevisiae},
  volume = {40}
}

@article{Northrup2011,
  abstract = {Behaviors observed at the cellular level such as development and acquisition of effector functions by immune cells result from transcriptional changes. The biochemical mediators of transcription are sequence-specific transcription factors (TFs), chromatin modifying enzymes, and chromatin, the complex of DNA and histone proteins. Covalent modification of DNA and histones, also termed epigenetic modification, influences the accessibility of target sequences for transcription factors on chromatin and the expression of linked genes required for immune functions. Genome-wide techniques such as ChIP-Seq have described the entire "cistrome" of transcription factors involved in specific developmental steps of B and T~cells and started to define specific immune responses in terms of the binding profiles of critical effectors and epigenetic modification patterns. Current data suggest that both promoters and enhancers are prepared for action at different stages of activation by epigenetic modification through distinct transcription factors in different cells.},
  author = {Northrup, Daniel L and Zhao, Keji},
  date = {2011-06-24},
  doi = {10/ct8p9d},
  eprint = {21703538},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Northrup and Zhao - 2011 - Application of ChIP-Seq and related techniques to .pdf},
  issn = {1097-4180},
  journaltitle = {Immunity},
  keywords = {Animals,Chromatin Immunoprecipitation,Chromatin Immunoprecipitation: methods,Epigenesis; Genetic,Genome,Humans,Promoter Regions; Genetic,Sequence Analysis; DNA,Sequence Analysis; DNA: methods,Transcription Factors,Transcription Factors: genetics,Transcription Factors: immunology},
  number = {6},
  pages = {830-842},
  title = {Application of {{ChIP}}-{{Seq}} and Related Techniques to the Study of Immune Function},
  volume = {34}
}

@article{Nowicka2016,
  author = {Nowicka, Malgorzata and Robinson, Mark D.},
  date = {2016},
  doi = {10/ggcxk8},
  file = {/Users/ryan/Documents/Zotero Library/Nowicka and Robinson - 2016 - DRIMSeq a Dirichlet-multinomial framework for mul.pdf},
  issn = {2046-1402},
  journaltitle = {F1000Research},
  pages = {1356},
  title = {{{DRIMSeq}}: A {{Dirichlet}}-Multinomial Framework for Multivariate Count Outcomes in Genomics},
  volume = {5}
}

@article{Nueda2007,
  abstract = {Designed microarray experiments are used to investigate the effects that controlled experimental factors have on gene expression and learn about the transcriptional responses associated with external variables. In these datasets, signals of interest coexist with varying sources of unwanted noise in a framework of (co)relation among the measured variables and with the different levels of the studied factors. Discovering experimentally relevant transcriptional changes require methodologies that take all these elements into account.},
  author = {Nueda, Mar{\'i}a Jos{\'e} and Conesa, Ana and Westerhuis, Johan A and Hoefsloot, Huub C J and Smilde, Age K and Tal{\'o}n, Manuel and Ferrer, Alberto},
  date = {2007-07-15},
  doi = {10/bzmdpv},
  eprint = {17519250},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Nueda et al. - 2007 - Discovering gene expression patterns in time cours.pdf},
  issn = {1367-4811},
  journaltitle = {Bioinformatics},
  keywords = {Algorithms,Analysis of Variance,Computational Biology,Computational Biology: methods,Computer Simulation,Data Interpretation; Statistical,Gene Expression Profiling,Gene Expression Profiling: methods,Models; Genetic,Models; Statistical,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods,Principal Component Analysis,Time Factors,Transcription; Genetic},
  number = {14},
  pages = {1792-1800},
  title = {Discovering Gene Expression Patterns in Time Course Microarray Experiments by {{ANOVA}}-{{SCA}}},
  volume = {23}
}

@report{NuGEN2010,
  author = {{NuGEN}},
  date = {2010},
  file = {/Users/ryan/Documents/Zotero Library/NuGEN - 2010 - Performance verification of the automated NuGEN Ov.pdf},
  institution = {{NuGEN Technologies, Inc.}},
  title = {Performance Verification of the Automated {{NuGEN Ovation Whole Blood Solution}}},
  type = {techreport},
  url = {http://www.nugeninc.com/nugen/?LinkServID=89366653-85CF-44AC-80672BBD775B0170}
}

@article{Nygaard2016,
  abstract = {Removal of, or adjustment for, batch effects or center differences is generally required when such effects are present in data. In particular, when preparing microarray gene expression data from multiple cohorts, array platforms, or batches for later analyses, batch effects can have confounding effects, inducing spurious differences between study groups. Many methods and tools exist for removing batch effects from data. However, when study groups are not evenly distributed across batches, actual group differences may induce apparent batch differences, in which case batch adjustments may bias, usually deflate, group differences. Some tools therefore have the option of preserving the difference between study groups, e.g. using a two-way ANOVA model to simultaneously estimate both group and batch effects. Unfortunately, this approach may systematically induce incorrect group differences in downstream analyses when groups are distributed between the batches in an unbalanced manner. The scientific community seems to be largely unaware of how this approach may lead to false discoveries.},
  author = {Nygaard, Vegard and R{\o}dland, Einar Andreas and Hovig, Eivind},
  date = {2016-01-01},
  doi = {10/gb87st},
  eprint = {26272994},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Nygaard et al. - 2016 - Methods that remove batch effects while retaining .pdf},
  issn = {1468-4357},
  journaltitle = {Biostatistics},
  keywords = {Batch effects,Data normalization,Microarrays,Reproducible research},
  number = {1},
  pages = {29-39},
  title = {Methods That Remove Batch Effects While Retaining Group Differences May Lead to Exaggerated Confidence in Downstream Analyses},
  volume = {17}
}

@article{OHara2010,
  author = {O'Hara, Robert B. and Kotze, D. Johan},
  date = {2010-03-24},
  doi = {10/cjnz7r},
  file = {/Users/ryan/Documents/Zotero Library/O’Hara and Kotze - 2010 - Do not log-transform count data.pdf},
  issn = {2041-210X},
  journaltitle = {Methods in Ecology and Evolution},
  number = {2},
  pages = {118-122},
  title = {Do Not Log-Transform Count Data},
  volume = {1}
}

@article{Otto2011a,
  abstract = {Second-generation sequencing technologies have made large-scale sequencing projects commonplace. However, making use of these datasets often requires gene function to be ascribed genome wide. Although tool development has kept pace with the changes in sequence production, for tasks such as mapping, de novo assembly or visualization, genome annotation remains a challenge. We have developed a method to rapidly provide accurate annotation for new genomes using previously annotated genomes as a reference. The method, implemented in a tool called RATT (Rapid Annotation Transfer Tool), transfers annotations from a high-quality reference to a new genome on the basis of conserved synteny. We demonstrate that a Mycobacterium tuberculosis genome or a single 2.5 Mb chromosome from a malaria parasite can be annotated in less than five minutes with only modest computational resources. RATT is available at http://ratt.sourceforge.net.},
  author = {Otto, Thomas D and Dillon, Gary P and Degrave, Wim S and Berriman, Matthew},
  date = {2011-05},
  doi = {10/bxvbxp},
  eprint = {21306991},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Otto et al. - 2011 - RATT Rapid Annotation Transfer Tool..pdf},
  issn = {1362-4962},
  journaltitle = {Nucleic Acids Research},
  keywords = {Algorithms,Genome; Bacterial,Genome; Protozoan,Genomics,Genomics: methods,Molecular Sequence Annotation,Molecular Sequence Annotation: methods,Mycobacterium tuberculosis,Mycobacterium tuberculosis: genetics,Plasmodium berghei,Plasmodium berghei: genetics,Plasmodium chabaudi,Plasmodium chabaudi: genetics,Software},
  number = {9},
  pages = {e57},
  title = {{{RATT}}: {{Rapid Annotation Transfer Tool}}},
  volume = {39}
}

@article{Ouyang2009,
  abstract = {Next-generation sequencing has greatly increased the scope and the resolution of transcriptional regulation study. RNA sequencing (RNA-Seq) and ChIP-Seq experiments are now generating comprehensive data on transcript abundance and on regulator-DNA interactions. We propose an approach for an integrated analysis of these data based on feature extraction of ChIP-Seq signals, principal component analysis, and regression-based component selection. Compared with traditional methods, our approach not only offers higher power in predicting gene expression from ChIP-Seq data but also provides a way to capture cooperation among regulators. In mouse embryonic stem cells (ESCs), we find that a remarkably high proportion of variation in gene expression (65\%) can be explained by the binding signals of 12 transcription factors (TFs). Two groups of TFs are identified. Whereas the first group (E2f1, Myc, Mycn, and Zfx) act as activators in general, the second group (Oct4, Nanog, Sox2, Smad1, Stat3, Tcfcp2l1, and Esrrb) may serve as either activator or repressor depending on the target. The two groups of TFs cooperate tightly to activate genes that are differentially up-regulated in ESCs. In the absence of binding by the first group, the binding of the second group is associated with genes that are repressed in ESCs and derepressed upon early differentiation.},
  author = {Ouyang, Zhengqing and Zhou, Qing and Wong, Wing Hung},
  date = {2009},
  doi = {10/d4wtws},
  eprint = {19995984},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Ouyang et al. - 2009 - ChIP-Seq of transcription factors predicts absolut.pdf},
  issn = {0027-8424, 1091-6490},
  journaltitle = {Proceedings of the National Academy of Sciences of the United States of America},
  keywords = {Animals,Chromatin Immunoprecipitation,Embryonic Stem Cells,Embryonic Stem Cells: metabolism,Gene Expression,Gene Regulatory Networks,Mice,Transcription Factors,Transcription Factors: chemistry,Transcription Factors: genetics,Transcription Factors: metabolism},
  number = {51},
  pages = {21521-21526},
  title = {{{ChIP}}-{{Seq}} of Transcription Factors Predicts Absolute and Differential Gene Expression in Embryonic Stem Cells},
  volume = {106}
}

@inproceedings{Patel2018,
  author = {Patel, J and Heshmati, K and Rogers, A and Wali, R and Jonsson, J and Liu, C and Emery, E and Collins, D and Karzai, S and Piper, J},
  booktitle = {Poster {{Session D}}: {{Kidney}}: {{Acute Cellular Rejection}}},
  date = {2018-06-05},
  eventtitle = {2018 {{American Transplant Congress}}},
  file = {/Users/ryan/Documents/Zotero Library/Determining the Utility of Protocol Biopsies in Kidney Transplant Recipients - ATC Abstracts (2019-11-15 9_59_35 AM).html},
  keywords = {\#nosource,⛔ No DOI found},
  title = {Determining the Utility of Protocol Biopsies in Kidney Transplant Recipients [Abstract]},
  url = {https://atcmeetingabstracts.com/abstract/determining-the-utility-of-protocol-biopsies-in-kidney-transplant-recipients/},
  urldate = {2019-11-15},
  venue = {{Seattle, WA}}
}

@article{Patro2017,
  author       = {Patro, Rob and Duggal, Geet and Love, Michael I and Irizarry, Rafael A and Kingsford, Carl},
  title        = {Salmon Provides Fast and Bias-Aware Quantification of Transcript Expression},
  journaltitle = {Nature Methods},
  date         = {2017-04-06},
  volume       = {14},
  number       = {4},
  pages        = {417-419},
  issn         = {1548-7091},
  doi          = {10/gcw9f5},
  file         = {/Users/ryan/Documents/Zotero Library/Patro et al. - 2017 - Salmon provides fast and bias-aware quantification.pdf}
}

@article{Pawlikowska2014,
  abstract = {Several outlier and subgroup identification statistics (OASIS) have been proposed to discover transcriptomic features with outliers or multiple modes in expression that are indicative of distinct biological processes or subgroups. Here, we borrow ideas from the OASIS methods in the bioinformatics and statistics literature to develop the most informative spacing test (MIST) for unsupervised detection of such transcriptomic features. In an example application involving 14 cases of pediatric acute megakaryoblastic leukemia, MIST more robustly identified features that perfectly discriminate subjects according to gender or the presence of a prognostically relevant fusion-gene than did seven other OASIS methods in the analysis of RNA-seq exon expression, RNA-seq exon junction expression, and microarray exon expression data. MIST was also effective at identifying features related to gender or molecular subtype in an example application involving 157 adult cases of acute myeloid leukemia. MIST will be freely available in the OASIS R package at http://www.stjuderesearch.org/site/depts/biostats.},
  author = {Pawlikowska, Iwona and Wu, Gang and Edmonson, Michael and Liu, Zhifa and Gruber, Tanja and Zhang, Jinghui and Pounds, Stan},
  date = {2014-01-22},
  doi = {10/f546tk},
  eprint = {24458951},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Pawlikowska et al. - 2014 - The Most Informative Spacing Test Effectively Disc.pdf},
  issn = {1367-4811},
  journaltitle = {Bioinformatics},
  pages = {1-9},
  title = {The Most Informative Spacing Test Effectively Discovers Biologically Relevant Outliers or Multiple Modes in Expression}
}

@article{pedersenCombpSoftwareCombining2012,
  ids          = {Pedersen2012a},
  author       = {Pedersen, B. S. and Schwartz, D. A. and Yang, I. V. and Kechris, K. J.},
  title        = {Comb-p: Software for Combining, Analyzing, Grouping and Correcting Spatially Correlated {{P}}-Values},
  shorttitle   = {Comb-p},
  journaltitle = {Bioinformatics},
  shortjournal = {Bioinformatics},
  date         = {2012-11-15},
  volume       = {28},
  number       = {22},
  pages        = {2986-2988},
  issn         = {1367-4803, 1460-2059},
  doi          = {10/f4csvb},
  langid       = {english},
  abstract     = {Summary: comb-p is a command-line tool and a python library that manipulates BED files of possibly irregularly spaced P-values and (1) calculates auto-correlation, (2) combines adjacent P-values, (3) performs false discovery adjustment, (4) finds regions of enrichment (i.e. series of adjacent low P-values) and (5) assigns significance to those regions. In addition, tools are provided for visualization and assessment. We provide validation and example uses on bisulfite-seq with P-values from Fisher's exact test, tiled methylation probes using a linear model and Dam-ID for chromatin binding using moderated t-statistics. Because the library accepts input in a simple, standardized format and is unaffected by the origin of the P-values, it can be used for a wide variety of applications.},
  file         = {/Users/ryan/Documents/Zotero Library/Pedersen et al. - 2012 - Comb-p software for combining, analyzing, groupin.pdf;/Users/ryan/Documents/Zotero Library/Pedersen et al. - 2012 - Comb-p software for combining, analyzing, groupin2.pdf}
}

@article{Pelz2008,
  abstract = {Microarray technology has become very popular for globally evaluating gene expression in biological samples. However, non-linear variation associated with the technology can make data interpretation unreliable. Therefore, methods to correct this kind of technical variation are critical. Here we consider a method to reduce this type of variation applied after three common procedures for processing microarray data: MAS 5.0, RMA, and dChip.},
  author = {Pelz, Carl R and Kulesz-Martin, Molly and Bagby, Grover and Sears, Rosalie C},
  date = {2008-01},
  doi = {10/dmcdfj},
  eprint = {19055840},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Pelz et al. - 2008 - Global rank-invariant set normalization (GRSN) to .pdf},
  issn = {1471-2105},
  journaltitle = {BMC Bioinformatics},
  keywords = {Analysis of Variance,Artifacts,Computational Biology,Computational Biology: methods,Computer Simulation,Data Interpretation; Statistical,Databases; Genetic,Gene Expression Profiling,Gene Expression Regulation,Models; Genetic,Oligonucleotide Array Sequence Analysis,Reproducibility of Results,Signal Transduction},
  pages = {520},
  title = {Global Rank-Invariant Set Normalization ({{GRSN}}) to Reduce Systematic Distortions in Microarray Data},
  volume = {9}
}

@article{Peng2012,
  author = {Peng, Ze and Zhao, Zhiying and Nath, Nandita and Froula, Jeff L. and Clum, Alicia and Zhang, Tao and Cheng, Jan-Fang and Copeland, Alex C. and Pennacchio, Len A. and Chen, Feng},
  date = {2012-01-09},
  doi = {10/fxjznx},
  editor = {Pellegrini, Matteo},
  file = {/Users/ryan/Documents/Zotero Library/Peng et al. - 2012 - Generation of Long Insert Pairs Using a Cre-LoxP I.pdf},
  issn = {1932-6203},
  journaltitle = {PLoS ONE},
  number = {1},
  pages = {e29437},
  title = {Generation of Long Insert Pairs Using a {{Cre}}-{{LoxP}} Inverse {{PCR}} Approach},
  volume = {7}
}

@article{Peng2012a,
  abstract = {MOTIVATION: Next-generation sequencing allows us to sequence reads from a microbial environment using single-cell sequencing or metagenomic sequencing technologies. However, both technologies suffer from the problem that sequencing depth of different regions of a genome or genomes from different species are highly uneven. Most existing genome assemblers usually have an assumption that sequencing depths are even. These assemblers fail to construct correct long contigs.

RESULTS: We introduce the IDBA-UD algorithm that is based on the de Bruijn graph approach for assembling reads from single-cell sequencing or metagenomic sequencing technologies with uneven sequencing depths. Several non-trivial techniques have been employed to tackle the problems. Instead of using a simple threshold, we use multiple depth-relative thresholds to remove erroneous k-mers in both low-depth and high-depth regions. The technique of local assembly with paired-end information is used to solve the branch problem of low-depth short repeat regions. To speed up the process, an error correction step is conducted to correct reads of high-depth regions that can be aligned to high-confident contigs. Comparison of the performances of IDBA-UD and existing assemblers (Velvet, Velvet-SC, SOAPdenovo and Meta-IDBA) for different datasets, shows that IDBA-UD can reconstruct longer contigs with higher accuracy.

AVAILABILITY: The IDBA-UD toolkit is available at our website http://www.cs.hku.hk/\textasciitilde{}alse/idba\_ud},
  author = {Peng, Yu and Leung, Henry C M and Yiu, S M and Chin, Francis Y L},
  date = {2012-06-01},
  doi = {10/f3z7hv},
  eprint = {22495754},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Peng et al. - 2012 - IDBA-UD a de novo assembler for single-cell and m.pdf},
  issn = {1367-4811},
  journaltitle = {Bioinformatics},
  keywords = {Algorithms,Bacteria,Bacteria: genetics,Genome,High-Throughput Nucleotide Sequencing,Metagenomics,Metagenomics: methods,Sequence Analysis; DNA,Sequence Analysis; DNA: methods,Single-Cell Analysis,Single-Cell Analysis: methods},
  number = {11},
  pages = {1420-1428},
  title = {{{IDBA}}-{{UD}}: A de Novo Assembler for Single-Cell and Metagenomic Sequencing Data with Highly Uneven Depth},
  volume = {28}
}

@incollection{Peng2012b,
  abstract = {Combinatorial binding of transcription factors (TFs) and cofactors to specific regulatory regions of target genes in vivo is an important mechanism of transcriptional regulation. Chromatin immunoprecipitation (ChIP) is a powerful technique to detect protein binding to specific regions of target genes in vivo. However, conventional ChIP analysis for individual factors (single ChIP) does not provide information on co-occupancy of two interacting TFs on target genes, even if both bind to the same chromatin regions. Double ChIP analysis involves sequential (double) immunoprecipitation of two chromatin-binding proteins and can be used to study co-occupancy of two or more factors on specific regions of the same DNA allele. Furthermore, by including a cell type-specific protein in double-ChIP, target co-occupancy in a specific cell type can be studied even if the other partner is more widely expressed. In this chapter, we describe a detailed protocol for double ChIP analysis in mouse retinas. Using the rod-specific transcription factor NR2E3 and the cone/rod homeobox protein CRX as examples, we show that NR2E3 and CRX are co-enriched on the promoter of active Rho and Rbp3 genes in rods, but are present to a much lesser degree on the promoters of silent cone opsin genes. These results suggest a new mechanism by which rod and cone genes are differentially regulated by these transcription factors in rod photoreceptors. \textcopyright{} 2013 Springer Science+Business Media, LLC.},
  author = {Peng, Guang Hua and Chen, Shiming},
  booktitle = {Methods in {{Molecular Biology}}},
  date = {2013},
  doi = {10/dd8p},
  file = {/Users/ryan/Documents/Zotero Library/Peng and Chen - 2013 - Double chromatin immunoprecipitation Analysis of .pdf},
  isbn = {978-1-62703-079-3},
  issn = {1064-3745},
  keywords = {Double chromatin immunoprecipitation,Retinal photoreceptors,Target co-occupancy,Transcription factor interactions},
  pages = {311-328},
  title = {Double Chromatin Immunoprecipitation: {{Analysis}} of Target Co-Occupancy of Retinal Transcription Factors},
  url = {http://link.springer.com/10.1007/978-1-62703-080-9_22},
  volume = {935}
}

@article{Peng2015,
  author       = {Peng, Xinxia and Thierry-Mieg, Jean and Thierry-Mieg, Danielle and Nishida, Andrew and Pipes, Lenore and Bozinoski, Marjan and Thomas, Matthew J. and Kelly, Sara and Weiss, Jeffrey M. and Raveendran, Muthuswamy and Muzny, Donna and Gibbs, Richard A. and Rogers, Jeffrey and Schroth, Gary P. and Katze, Michael G. and Mason, Christopher E.},
  title        = {Tissue-Specific Transcriptome Sequencing Analysis Expands the Non-Human Primate Reference Transcriptome Resource ({{NHPRTR}})},
  journaltitle = {Nucleic Acids Research},
  date         = {2015-01-28},
  volume       = {43},
  number       = {D1},
  pages        = {D737-D742},
  issn         = {1362-4962},
  doi          = {10/ggcxk9},
  eprint       = {25392405},
  eprinttype   = {pmid},
  abstract     = {The non-human primate reference transcriptome resource (NHPRTR, available online at http://nhprtr.org/) aims to generate comprehensive RNA-seq data from a wide variety of non-human primates (NHPs), from lemurs to hominids. In the 2012 Phase I of the NHPRTR project, 19 billion fragments or 3.8 terabases of transcriptome sequences were collected from pools of {$\sim$}20 tissues in 15 species and subspecies. Here we describe a major expansion of NHPRTR by adding 10.1 billion fragments of tissue-specific RNA-seq data. For this effort, we selected 11 of the original 15 NHP species and subspecies and constructed total RNA libraries for the same {$\sim$}15 tissues in each. The sequence quality is such that 88\% of the reads align to human reference sequences, allowing us to compute the full list of expression abundance across all tissues for each species, using the reads mapped to human genes. This update also includes improved transcript annotations derived from RNA-seq data for rhesus and cynomolgus macaques, two of the most commonly used NHP models and additional RNA-seq data compiled from related projects. Together, these comprehensive reference transcriptomes from multiple primates serve as a valuable community resource for genome annotation, gene dynamics and comparative functional analysis.},
  file         = {/Users/ryan/Documents/Zotero Library/Peng et al. - 2015 - Tissue-specific transcriptome sequencing analysis .pdf}
}

@report{Phipson2013,
  author = {Phipson, Belinda and Lee, Stanley and Majewski, Ian J and Alexander, Warren S and Smyth, Gordon K},
  date = {2013},
  file = {/Users/ryan/Documents/Zotero Library/Phipson et al. - 2013 - Empirical Bayes in the presence of exceptional cas.pdf},
  institution = {{Bioinformatics Division, Walter and Eliza Hall Institute of Medical Research}},
  keywords = {empirical bayes,gene expression,microarrays,outliers,robustness},
  pages = {1-18},
  title = {Empirical {{Bayes}} in the Presence of Exceptional Cases, with Application to Microarray Data},
  type = {techreport}
}

@thesis{Phipson2013Thesis,
  abstract = {New biotechnology developments such as the microarray, and more recently, next generation sequencing, have necessitated the need for new statistical methodologies to be developed. These methods are designed to combat unique issues present in the data generated by these technologies. They provide the perfect environment for information sharing strategies, such as empirical Bayes methods, due to the large numbers of simultaneous tests performed. We explore different estimators of the proportion of true null hypotheses and develop a fast and accurate estimator which is valid for any number of p-values. This estimator is based on local false discovery rates and is used in several of the proceeding sections. Another interest is in developing robust hyper-parameter estimators in an empirical Bayes hierarchical model setting. An estimator for the prior degrees of freedom which is robust to outliers is developed using two different approaches. This has the effect that highly variable genes are unlikely to be significantly differentially expressed, as well as increasing power to detect differential expression. The second half of the thesis focuses on gaining more information from the log fold changes obtained from microarray and sequencing experiments. More accurate log fold changes are developed for microarrays and RNA sequencing data, which provide additional information for ranking top differentially expressed genes. The new measure, called predictive log fold change, arises from the posterior distribution of the log fold changes. The relationship between two gene expression profiles is quantified when the p-values obtained from testing two hypotheses are not independent. This arises when two genotypes are compared to a common control group. The method is based on separating the true biological correlation from the technical correlation of the log fold changes. The hyperparameters of the prior distribution for the log fold changes need to be estimated in order to get an estimate of the biological correlation. This is possible since we show that the two dependent moderated t statistics have a scaled multivariate t distribution. The methods developed in this thesis are tested using simulations and applied to data sets collected in collaboration with biologists at The Walter and Eliza Hall Institute of Medical Research.},
  author = {Phipson, Belinda},
  date = {2013},
  institution = {{The Walter and Eliza Hall Institute of Medical Research} and {The University of Melbourne}},
  keywords = {bayesian analysis,bioinformatics,empirical bayes,gene expression,microarrays,sequencing},
  langid = {english},
  title = {Empirical Bayes Modelling of Expression Profiles and Their Associations},
  type = {phdthesis},
  url = {http://hdl.handle.net/11343/38162}
}

@article{Phipson2016,
  abstract = {One of the most common analysis tasks in genomic research is to identify genes that are differentially expressed (DE) between experimental conditions. Empirical Bayes (EB) statistical tests using moderated genewise variances have been very effective for this purpose, especially when the number of biological replicate samples is small. The EB procedures can, however, be heavily influenced by a small number of genes with very large or very small variances. This article improves the differential expression tests by robustifying the hyperparameter estimation procedure. The robust procedure has the effect of decreasing the informativeness of the prior distribution for outlier genes while increasing its informativeness for other genes. This effect has the double benefit of reducing the chance that hypervariable genes will be spuriously identified as DE while increasing statistical power for the main body of genes. The robust EB algorithm is fast and numerically stable. The procedure allows exact small-sample null distributions for the test statistics and reduces exactly to the original EB procedure when no outlier genes are present. Simulations show that the robustified tests have similar performance to the original tests in the absence of outlier genes but have greater power and robustness when outliers are present. The article includes case studies for which the robust method correctly identifies and downweights genes associated with hidden covariates and detects more genes likely to be scientifically relevant to the experimental conditions. The new procedure is implemented in the limma software package freely available from the Bioconductor repository.},
  author = {Phipson, Belinda and Lee, Stanley and Majewski, Ian J. and Alexander, Warren S. and Smyth, Gordon K.},
  date = {2016-06},
  doi = {10/gfgp3f},
  eprint = {1602.08678},
  eprinttype = {arxiv},
  file = {/Users/ryan/Documents/Zotero Library/Phipson et al. - 2016 - Robust hyperparameter estimation protects against .pdf},
  issn = {1941-7330},
  journaltitle = {Annals of Applied Statistics},
  keywords = {Empirical Bayes,Gene expression,Microarrays,Outliers,Robustness},
  number = {2},
  pages = {946-963},
  title = {Robust Hyperparameter Estimation Protects against Hypervariable Genes and Improves Power to Detect Differential Expression},
  volume = {10}
}

@article{Piccolo2012,
  abstract = {Gene-expression microarrays allow researchers to characterize biological phenomena in a high-throughput fashion but are subject to technological biases and inevitable variabilities that arise during sample collection and processing. Normalization techniques aim to correct such biases. Most existing methods require multiple samples to be processed in aggregate; consequently, each sample's output is influenced by other samples processed jointly. However, in personalized-medicine workflows, samples may arrive serially, so renormalizing all samples upon each new arrival would be impractical. We have developed Single Channel Array Normalization (SCAN), a single-sample technique that models the effects of probe-nucleotide composition on fluorescence intensity and corrects for such effects, dramatically increasing the signal-to-noise ratio within individual samples while decreasing variation across samples. In various benchmark comparisons, we show that SCAN performs as well as or better than competing methods yet has no dependence on external reference samples and can be applied to any single-channel microarray platform.},
  author = {Piccolo, Stephen R and Sun, Ying and Campbell, Joshua D and Lenburg, Marc E and Bild, Andrea H and Johnson, W Evan},
  date = {2012-12},
  doi = {10/f4gwz9},
  eprint = {22959562},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Piccolo et al. - 2012 - A single-sample microarray normalization method to.pdf},
  issn = {1089-8646},
  journaltitle = {Genomics},
  keywords = {Analysis of Variance,Fluorescence,Gene Expression Profiling,Gene Expression Profiling: methods,High-Throughput Screening Assays,High-Throughput Screening Assays: methods,Humans,Individualized Medicine,Individualized Medicine: methods,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods,Sample Size,Selection Bias,Signal-To-Noise Ratio,Workflow},
  number = {6},
  pages = {337-344},
  title = {A Single-Sample Microarray Normalization Method to Facilitate Personalized-Medicine Workflows},
  volume = {100}
}

@article{Piechota2016,
  abstract = {BACKGROUND: The regulation of gene expression in eukaryotic cells is a complex process that involves epigenetic modifications and the interaction of DNA with multiple transcription factors. This process can be studied with unprecedented sensitivity using a combination of chromatin immunoprecipitation and next-generation DNA sequencing (ChIP-seq). Available ChIP-seq data can be further utilized to interpret new gene expression profiling experiments.

RESULTS: Here, we describe seqinspector, a tool that accepts any set of genomic coordinates from ChIP-seq or RNA-seq studies to identify shared transcriptional regulators. The presented web resource includes a large collection of publicly available ChIP-seq and RNA-seq experiments ({$>$}1300 tracks) performed on transcription factors, histone modifications, RNA polymerases, enhancers and insulators in humans and mice. Over-representation is calculated based on the coverage computed directly from indexed files storing ChIP-seq data (bigwig). Therefore, seqinspector is not limited to pre-computed sets of gene promoters.

CONCLUSION: The tool can be used to identify common gene expression regulators for sets of co-expressed transcripts (including miRNAs, lncRNAs or any novel unannotated RNAs) or for sets of ChIP-seq peaks to identify putative protein-protein interactions or transcriptional co-factors. The tool is available at http://seqinspector.cremag.org .},
  author = {Piechota, Marcin and Korostynski, Michal and Ficek, Joanna and Tomski, Andrzej and Przewlocki, Ryszard},
  date = {2016},
  doi = {10/ggcxmb},
  eprint = {26868127},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Piechota et al. - 2016 - Seqinspector Position-based navigation through th.pdf},
  issn = {1471-2105},
  journaltitle = {BMC Bioinformatics},
  keywords = {ChIP-seq,Gene expression,Microarray,Promoter analysis,RNA-seq,Transcription factor},
  number = {1},
  pages = {1-7},
  title = {Seqinspector: {{Position}}-Based Navigation through the {{ChIP}}-Seq Data Landscape to Identify Gene Expression Regulators},
  volume = {17}
}

@article{Pimentel2016,
  abstract = {We describe a novel method for the differential analysis of RNA-Seq data that utilizes bootstrapping in conjunction with response error linear modeling to decouple biological variance from inferential variance. The method is implemented in an interactive shiny app called sleuth that utilizes kallisto quantifications and bootstraps for fast and accurate analysis of RNA-Seq experiments.},
  author = {Pimentel, Harold J and Bray, Nicolas and Puente, Suzette and Melsted, P{\'a}ll and Pachter, Lior},
  date = {2016},
  doi = {10/gfn5bn},
  file = {/Users/ryan/Documents/Zotero Library/Pimentel et al. - 2016 - Differential analysis of RNA-Seq incorporating qua.pdf},
  journaltitle = {bioRxiv},
  pages = {058164},
  title = {Differential Analysis of {{RNA}}-{{Seq}} Incorporating Quantification Uncertainty}
}

@article{Pinheiro2012,
  abstract = {Genetic information storage and processing rely on just two polymers, DNA and RNA, yet whether their role reflects evolutionary history or fundamental functional constraints is currently unknown. With the use of polymerase evolution and design, we show that genetic information can be stored in and recovered from six alternative genetic polymers based on simple nucleic acid architectures not found in nature [xeno-nucleic acids (XNAs)]. We also select XNA aptamers, which bind their targets with high affinity and specificity, demonstrating that beyond heredity, specific XNAs have the capacity for Darwinian evolution and folding into defined structures. Thus, heredity and evolution, two hallmarks of life, are not limited to DNA and RNA but are likely to be emergent properties of polymers capable of information storage.},
  author = {Pinheiro, Vitor B and Taylor, Alexander I and Cozens, Christopher and Abramov, Mikhail and Renders, Marleen and Zhang, Su and Chaput, John C and Wengel, Jesper and Peak-Chew, Sew-Yeu and McLaughlin, Stephen H and Herdewijn, Piet and Holliger, Philipp},
  date = {2012-04-20},
  doi = {10/ggcxmc},
  eprint = {22517858},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Pinheiro et al. - 2012 - Synthetic genetic polymers capable of heredity and.pdf},
  issn = {1095-9203},
  journaltitle = {Science},
  keywords = {Aptamers; Nucleotide,Aptamers; Nucleotide: chemistry,Aptamers; Nucleotide: genetics,Aptamers; Nucleotide: metabolism,Directed Molecular Evolution,DNA,DNA-Directed DNA Polymerase,DNA-Directed DNA Polymerase: chemistry,DNA-Directed DNA Polymerase: genetics,DNA-Directed DNA Polymerase: metabolism,DNA: chemistry,DNA: genetics,Evolution; Molecular,Molecular Mimicry,Nucleic Acids,Nucleic Acids: chemistry,Nucleic Acids: genetics,Nucleic Acids: metabolism,Polymers,Polymers: chemistry,Polymers: metabolism,Reverse Transcription,RNA,RNA-Directed DNA Polymerase,RNA-Directed DNA Polymerase: chemistry,RNA-Directed DNA Polymerase: metabolism,RNA: chemistry,RNA: genetics,Templates; Genetic,Transcription; Genetic},
  number = {6079},
  pages = {341--344},
  title = {Synthetic Genetic Polymers Capable of Heredity and Evolution},
  volume = {336}
}

@article{Popendorf2010,
  abstract = {With the number of available genome sequences increasing rapidly, the magnitude of sequence data required for multiple-genome analyses is a challenging problem. When large-scale rearrangements break the collinearity of gene orders among genomes, genome comparison algorithms must first identify sets of short well-conserved sequences present in each genome, termed anchors. Previously, anchor identification among multiple genomes has been achieved using pairwise alignment tools like BLASTZ through progressive alignment tools like TBA, but the computational requirements for sequence comparisons of multiple genomes quickly becomes a limiting factor as the number and scale of genomes grows.},
  author = {Popendorf, Kris and Tsuyoshi, Hachiya and Osana, Yasunori and Sakakibara, Yasubumi},
  date = {2010-01},
  doi = {10/bph9sb},
  eprint = {20885980},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Popendorf et al. - 2010 - Murasaki a fast, parallelizable algorithm to find.pdf},
  issn = {1932-6203},
  journaltitle = {PLoS ONE},
  keywords = {Algorithms,Animals,Bacteria,Bacteria: chemistry,Bacteria: genetics,Cattle,Conserved Sequence,Dogs,Genome,Humans,Mammals,Mammals: genetics,Mice,Rats,Sequence Alignment,Sequence Alignment: methods},
  number = {9},
  pages = {e12651},
  title = {Murasaki: A Fast, Parallelizable Algorithm to Find Anchors from Multiple Genomes},
  volume = {5}
}

@article{Pounds2005,
  abstract = {MOTIVATION: There is not a widely applicable method to determine the sample size for experiments basing statistical significance on the false discovery rate (FDR).

RESULTS: We propose and develop the anticipated FDR (aFDR) as a conceptual tool for determining sample size. We derive mathematical expressions for the aFDR and anticipated average statistical power. These expressions are used to develop a general algorithm to determine sample size. We provide specific details on how to implement the algorithm for a k-group (k {$>$} or = 2) comparisons. The algorithm performs well for k-group comparisons in a series of traditional simulations and in a real-data simulation conducted by resampling from a large, publicly available dataset.

AVAILABILITY: Documented S-plus and R code libraries are freely available from www.stjuderesearch.org/depts/biostats.},
  author = {Pounds, Stan and Cheng, Cheng},
  date = {2005-12-01},
  doi = {10/dcbn4j},
  eprint = {16204346},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Pounds and Cheng - 2005 - Sample size determination for the false discovery .pdf},
  issn = {1367-4803},
  journaltitle = {Bioinformatics},
  keywords = {Algorithms,Computational Biology,Computational Biology: methods,Computer Simulation,Data Interpretation; Statistical,Databases; Protein,False Positive Reactions,Gene Expression Profiling,Models; Genetic,Models; Statistical,Oligonucleotide Array Sequence Analysis,Reproducibility of Results,Sample Size,Software},
  number = {23},
  pages = {4263--4271},
  title = {Sample Size Determination for the False Discovery Rate},
  volume = {21}
}

@article{Pounds2006,
  abstract = {Motivation: Presently available methods that use p-values to estimate or control the false discovery rate (FDR) implicitly assume that p-values are continuously distributed and based on two-sided tests. Therefore, it is difficult to reliably estimate the FDR when p-values are discrete or based on one-sided tests. Results: A simple and robust method to estimate the FDR is proposed. The proposed method does not rely on implicit assumptions that tests are two-sided or yield continuously distributed p-values. The proposed method is proven to be conservative and have desirable large-sample properties. In addition, the proposed method was among the best performers across a series of `real data simulations' comparing the performance of five currently available methods. Availability: Libraries of S-plus and R routines to implement the method are freely available from www.stjuderesearch.org/depts/biostats Contact: stanley.pounds@stjude.org Supplementary information: Supplementary data are available at Bioinformatics online.},
  author = {Pounds, Stan and Cheng, Cheng},
  date = {2006},
  doi = {10/dhxk6v},
  eprint = {16777905},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Mendeley Desktop/Pounds, Cheng - 2006 - Robust estimation of the false discovery rate.r;/Users/ryan/Documents/Zotero Library/Pounds and Cheng - 2006 - Robust estimation of the false discovery rate.pdf},
  issn = {1367-4803},
  journaltitle = {Bioinformatics},
  keywords = {★},
  number = {16},
  pages = {1979--1987},
  title = {Robust Estimation of the False Discovery Rate},
  volume = {22}
}

@article{Pounds2006a,
  abstract = {The analysis of microarray data often involves performing a large number of statistical tests, usually at least one test per queried gene. Each test has a certain probability of reaching an incorrect inference; therefore, it is crucial to estimate or control error rates that measure the occurrence of erroneous conclusions in reporting and interpreting the results of a microarray study. In recent years, many innovative statistical methods have been developed to estimate or control various error rates for microarray studies. Researchers need guidance choosing the appropriate statistical methods for analysing these types of data sets. This review describes a family of methods that use a set of P-values to estimate or control the false discovery rate and similar error rates. Finally, these methods are classified in a manner that suggests the appropriate method for specific applications and diagnostic procedures that can identify problems in the analysis are described.},
  author = {Pounds, Stanley B.},
  date = {2006},
  doi = {10/fdk87g},
  eprint = {16761362},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Pounds - 2006 - Estimation and control of multiple testing error r.pdf},
  issn = {1467-5463},
  journaltitle = {Briefings in Bioinformatics},
  keywords = {Error rate,False discovery rate,Gene expression,Microarray,Multiple testing,Statistical analysis},
  number = {1},
  pages = {25--36},
  title = {Estimation and Control of Multiple Testing Error Rates for Microarray Studies},
  volume = {7}
}

@article{Purdom2008,
  abstract = {MOTIVATION: Analyses of EST data show that alternative splicing is much more widespread than once thought. The advent of exon and tiling microarrays means that researchers now have the capacity to experimentally measure alternative splicing on a genome wide level. New methods are needed to analyze the data from these arrays. RESULTS: We present a method, finding isoforms using robust multichip analysis (FIRMA), for detecting differential alternative splicing in exon array data. FIRMA has been developed for Affymetrix exon arrays, but could in principle be extended to other exon arrays, tiling arrays or splice junction arrays. We have evaluated the method using simulated data, and have also applied it to two datasets: a panel of 11 human tissues and a set of 10 pairs of matched normal and tumor colon tissue. FIRMA is able to detect exons in several genes confirmed by reverse transcriptase PCR. AVAILABILITY: R code implementing our methods is contributed to the package aroma.affymetrix.},
  author = {Purdom, E and Simpson, K M and Robinson, M D and Conboy, J G and Lapuk, A V and Speed, T P},
  date = {2008-08-01},
  doi = {10/dx8mws},
  eprint = {18573797},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Purdom et al. - 2008 - FIRMA a method for detection of alternative splic.pdf},
  issn = {1367-4811},
  journaltitle = {Bioinformatics},
  keywords = {Algorithms,Chromosome Mapping,Chromosome Mapping: methods,Databases; Genetic,DNA,DNA: methods,Expressed Sequence Tags,Genetic,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods,Reverse Transcriptase Polymerase Chain Reaction,Reverse Transcriptase Polymerase Chain Reaction: m,RNA Splice Sites,RNA Splice Sites: genetics,Sequence Analysis; DNA,Sequence Analysis; DNA: methods},
  number = {15},
  pages = {1707--1714},
  title = {{{FIRMA}}: A Method for Detection of Alternative Splicing from Exon Array Data},
  volume = {24}
}

@article{Quail2012,
  author = {Quail, Michael and Smith, Miriam E and Coupland, Paul and Otto, Thomas D and Harris, Simon R and Connor, Thomas R and Bertoni, Anna and Swerdlow, Harold P and Gu, Yong},
  date = {2012},
  doi = {10/gb3d9d},
  file = {/Users/ryan/Documents/Zotero Library/Quail et al. - 2012 - A tale of three next generation sequencing platfor.pdf},
  issn = {1471-2164},
  journaltitle = {BMC Genomics},
  number = {1},
  pages = {341},
  title = {A Tale of Three Next Generation Sequencing Platforms: Comparison of {{Ion Torrent}}, {{Pacific Biosciences}} and {{Illumina MiSeq}} Sequencers},
  volume = {13}
}

@manual{R-lang,
  author = {{R Core Team}},
  date = {2019},
  date-added = {2019-10-01 17:51:36 -0700},
  date-modified = {2019-10-01 17:52:10 -0700},
  ids = {rcoreteamLanguageEnvironmentStatistical2019},
  location = {{Vienna, Austria}},
  organization = {{R Foundation for Statistical Computing}},
  title = {R: {{A}} Language and Environment for Statistical Computing},
  url = {https://www.R-project.org/}
}

@article{Raabe2013,
  author = {Raabe, C. A. and Tang, T.-H. and Brosius, J. and Rozhdestvensky, T. S.},
  date = {2013-11-05},
  doi = {10/f5rps8},
  file = {/Users/ryan/Documents/Zotero Library/Raabe et al. - 2013 - Biases in small RNA deep sequencing data.pdf},
  issn = {0305-1048},
  journaltitle = {Nucleic Acids Research},
  pages = {1--13},
  title = {Biases in Small {{RNA}} Deep Sequencing Data}
}

@article{Ramachandran2013,
  author       = {Ramachandran, Parameswaran and Perkins, Theodore J},
  title        = {Adaptive Bandwidth Kernel Density Estimation for Next-Generation Sequencing Data},
  journaltitle = {BMC Proceedings},
  date         = {2013},
  volume       = {7},
  issue        = {Suppl 7},
  pages        = {S7},
  issn         = {1753-6561},
  doi          = {10/ggcxmf},
  file         = {/Users/ryan/Documents/Zotero Library/Ramachandran and Perkins - 2013 - Adaptive bandwidth kernel density estimation for n.pdf}
}

@article{Raman2014a,
  author = {Raman, Indira M.},
  date = {2014-01},
  doi = {10/gddmjp},
  file = {/Users/ryan/Documents/Zotero Library/Raman - 2014 - How to Be a Graduate Advisee.pdf},
  issn = {0896-6273},
  journaltitle = {Neuron},
  number = {1},
  pages = {9--11},
  title = {How to {{Be}} a {{Graduate Advisee}}},
  volume = {81}
}

@article{Rangaraju2015,
  author = {Rangaraju, Sunitha and Solis, Gregory M. and Andersson, Sofia I. and Gomez-Amaro, Rafael L. and Kardakaris, Rozina and Broaddus, Caroline D. and Niculescu, Alexander B. and Petrascheck, Michael},
  date = {2015},
  doi = {10/ggcxmd},
  file = {/Users/ryan/Documents/Zotero Library/Rangaraju et al. - 2015 - Atypical antidepressants extend lifespan of iCae.pdf},
  issn = {1474-9718},
  issue = {June},
  journaltitle = {Aging Cell},
  keywords = {antidepressant},
  title = {Atypical Antidepressants Extend Lifespan of {{\emph{Caenorhabditis elegans}}} by Activation of a Non-Cell-Autonomous Stress Response},
  volume = {1}
}

@article{Rangaraju2015a,
  abstract = {Longevity mechanisms increase lifespan by counteracting the effects of aging. However, whether longevity mechanisms counteract the effects of aging continually throughout life, or whether they act during specific periods of life, preventing changes that precede mortality is unclear. Here, we uncover transcriptional drift, a phenomenon that describes how aging causes genes within functional groups to change expression in opposing directions. These changes cause a transcriptome-wide loss in mRNA stoichiometry and loss of co-expression patterns in aging animals, as compared to young adults. Using Caenorhabditis elegans as a model, we show that extending lifespan by inhibiting serotonergic signals by the antidepressant mianserin attenuates transcriptional drift, allowing the preservation of a younger transcriptome into an older age. Our data are consistent with a model in which inhibition of serotonergic signals slows age-dependent physiological decline and the associated rise in mortality levels exclusively in young adults, thereby postponing the onset of major mortality.},
  author = {Rangaraju, Sunitha and Solis, Gregory M. and Thompson, Ryan C. and Gomez-Amaro, Rafael L. and Kurian, Leo and Encalada, Sandra E. and Niculescu, Alexander B. and Salomon, Daniel R. and Petrascheck, Michael},
  date = {2015-12-01},
  doi = {10/ggcxmg},
  eprint = {26623667},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Rangaraju et al. - 2015 - Suppression of transcriptional drift extends C. el.pdf;/Users/ryan/Documents/Zotero Library/Rangaraju et al. - 2015 - Suppression of transcriptional drift extends C. el2.pdf},
  issn = {2050-084X},
  issue = {December2015},
  journaltitle = {eLife},
  pages = {1--39},
  title = {Suppression of Transcriptional Drift Extends {{C. elegans}} Lifespan by Postponing the Onset of Mortality},
  volume = {4}
}

@article{Rapaport2013,
  author       = {Rapaport, Franck and Khanin, Raya and Liang, Yupu and Pirun, Mono and Krek, Azra and Zumbo, Paul and Mason, Christopher E and Socci, Nicholas D and Betel, Doron},
  title        = {Comprehensive Evaluation of Differential Gene Expression Analysis Methods for {{RNA}}-Seq Data},
  journaltitle = {Genome Biology},
  date         = {2013},
  volume       = {14},
  number       = {9},
  pages        = {R95},
  issn         = {1465-6906},
  doi          = {10/gfs2kn},
  file         = {/Users/ryan/Documents/Zotero Library/Rapaport et al. - 2013 - Comprehensive evaluation of differential gene expr.pdf}
}

@article{Rashid2011,
  abstract = {ZINBA (Zero-Inflated Negative Binomial Algorithm) identifies genomic regions enriched in a variety of ChIP-seq and related next-generation sequencing experiments (DNA-seq), calling both broad and narrow modes of enrichment across a range of signal-to-noise ratios. ZINBA models and accounts for factors that co-vary with background or experimental signal, such as G/C content, and identifies enrichment in genomes with complex local copy number variations. ZINBA provides a single unified framework for analyzing DNA-seq experiments in challenging genomic contexts.},
  author = {Rashid, Naim U and Giresi, Paul G and Ibrahim, Joseph G and Sun, Wei and Lieb, Jason D},
  date = {2011-01},
  doi = {10/c78mbz},
  eprint = {21787385},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Rashid et al. - 2011 - ZINBA integrates local covariates with DNA-seq dat.pdf},
  issn = {1465-6914},
  journaltitle = {Genome Biology},
  keywords = {Algorithms,Computer Simulation,DNA Copy Number Variations,Genomics,Genomics: methods,Models; Genetic,Models; Statistical,Sequence Analysis; DNA,Software},
  number = {7},
  pages = {R67},
  title = {{{ZINBA}} Integrates Local Covariates with {{DNA}}-Seq Data to Identify Broad and Narrow Regions of Enrichment, Even within Amplified Genomic Regions},
  volume = {12}
}

@article{Reeb2013,
  abstract = {Validating statistical analysis methods for RNA sequencing (RNA-seq) experiments is a complex task. Researchers often find themselves having to decide between competing models or assessing the reliability of results obtained with a designated analysis program. Computer simulation has been the most frequently used procedure to verify the adequacy of a model. However, datasets generated by simulations depend on the parameterization and the assumptions of the selected model. Moreover, such datasets may constitute a partial representation of reality as the complexity or RNA-seq data is hard to mimic. We present the use of plasmode datasets to complement the evaluation of statistical models for RNA-seq data. A plasmode is a dataset obtained from experimental data but for which come truth is known. Using a set of simulated scenarios of technical and biological replicates, and public available datasets, we illustrate how to design algorithms to construct plasmodes under different experimental conditions. We contrast results from two types of methods for RNA-seq: (1) models based on negative binomial distribution (edgeR and DESeq), and (2) Gaussian models applied after transformation of data (MAANOVA). Results emphasize the fact that deciding what method to use may be experiment-specific due to the unknown distributions of expression levels. Plasmodes may contribute to choose which method to apply by using a similar pre-existing dataset. The promising results obtained from this approach, emphasize the need of promoting and improving systematic data sharing across the research community to facilitate plasmode building. Although we illustrate the use of plasmode for comparing differential expression analysis models, the flexibility of plasmode construction allows comparing upstream analysis, as normalization procedures or alignment pipelines, as well.},
  author = {Reeb, Pablo D. and Steibel, Juan P.},
  date = {2013},
  doi = {10/ggcxmh},
  file = {/Users/ryan/Documents/Zotero Library/Reeb and Steibel - 2013 - Evaluating statistical analysis models for RNA seq.pdf},
  issn = {1664-8021},
  issue = {September},
  journaltitle = {Frontiers in Genetics},
  keywords = {line,linear models,plasmodes,RNA-seq,simulation,type I error},
  pages = {1--9},
  title = {Evaluating Statistical Analysis Models for {{RNA}} Sequencing Experiments},
  volume = {4}
}

@article{Remus2009,
  abstract = {The initiation of chromosomal DNA replication involves initiator proteins that recruit and load hexameric DNA helicases at replication origins. This helicase loading step is tightly regulated in bacteria and eukaryotes. In contrast to the situation in bacteria, the eukaryotic helicase is loaded in an inactive form. This extra 'lock and load' mechanism in eukaryotes allows regulation of a second step, helicase activation. The temporal separation of helicase loading and activation is crucial for the coordination of DNA replication with cell growth and extracellular signals, the prevention of re-replication and the control of origin activity in response to replication stress. Initiator proteins in bacteria and eukaryotes are structurally homologous; yet the replicative helicases they load are unrelated. Understanding how these helicases are loaded and how they act during unwinding may have important implications for understanding how DNA replication is regulated in different domains of life.},
  author = {Remus, Dirk and Diffley, John F X},
  date = {2009-12},
  doi = {10/fkxk7z},
  eprint = {19767190},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Remus and Diffley - 2009 - Eukaryotic DNA replication control lock and load,.pdf},
  issn = {1879-0410},
  journaltitle = {Current Opinion in Cell Biology},
  keywords = {Animals,Cell Cycle,Cell Cycle Proteins,Cell Cycle Proteins: genetics,Cell Cycle Proteins: metabolism,Cell Cycle: genetics,DNA,DNA Helicases,DNA Helicases: genetics,DNA Helicases: metabolism,DNA Replication,DNA: chemistry,DNA: metabolism,DnaB Helicases,DnaB Helicases: genetics,DnaB Helicases: metabolism,Eukaryota,Eukaryota: genetics,Eukaryota: physiology,Humans,Models; Biological,Saccharomyces cerevisiae Proteins,Saccharomyces cerevisiae Proteins: genetics,Saccharomyces cerevisiae Proteins: metabolism},
  number = {6},
  pages = {771--777},
  title = {Eukaryotic {{DNA}} Replication Control: Lock and Load, Then Fire},
  volume = {21}
}

@article{Reyes2013,
  abstract = {Alternative usage of exons provides genomes with plasticity to produce different transcripts from the same gene, modulating the function, localization, and life cycle of gene products. It affects most human genes. For a limited number of cases, alternative functions and tissue-specific roles are known. However, recent high-throughput sequencing studies have suggested that much alternative isoform usage across tissues is nonconserved, raising the question of the extent of its functional importance. We address this question in a genome-wide manner by analyzing the transcriptomes of five tissues for six primate species, focusing on exons that are 1:1 orthologous in all six species. Our results support a model in which differential usage of exons has two major modes: First, most of the exons show only weak differences, which are dominated by interspecies variability and may reflect neutral drift and noisy splicing. These cases dominate the genome-wide view and explain why conservation appears to be so limited. Second, however, a sizeable minority of exons show strong differences between tissues, which are mostly conserved. We identified a core set of 3,800 exons from 1,643 genes that show conservation of strongly tissue-dependent usage patterns from human at least to macaque. This set is enriched for exons encoding protein-disordered regions and untranslated regions. Our findings support the theory that isoform regulation is an important target of evolution in primates, and our method provides a powerful tool for discovering potentially functional tissue-dependent isoforms.},
  author = {Reyes, Alejandro and Anders, Simon and Weatheritt, Robert J and Gibson, Toby J and Steinmetz, Lars M and Huber, Wolfgang},
  date = {2013-09-17},
  doi = {10/f5h64s},
  eprint = {24003148},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Reyes et al. - 2013 - Drift and conservation of differential exon usage .pdf},
  issn = {1091-6490},
  journaltitle = {Proceedings of the National Academy of Sciences of the United States of America},
  number = {38},
  pages = {15377--15382},
  title = {Drift and Conservation of Differential Exon Usage across Tissues in Primate Species},
  volume = {110}
}

@article{Reyes2017,
  abstract = {Most human genes have multiple transcription start and polyadenylation sites, as well as alternatively spliced exons. While transcript isoform diversity contributes to shape cellular specificity, it is currently unclear what is the balance of contributions from alternative splicing compared to alternative start and termination sites of transcription. Here, we address this question by analyzing data from the Genotype-Tissue Expression Project. We found tissue-dependent usage of exons for around one-half of expressed genes. Although tissue-dependent splicing was frequent among untranslated exons, it explained less than half of the differences in exon usage across tissues, suggesting that most of these differences were driven by alternative transcription start and termination sites. Analysis of the FANTOM Project data confirmed widespread tissue-dependent usage of alternative transcriptional start sites. Our analysis highlights alternative initiation and termination sites of transcription as the main drivers of isoform diversity across tissues. We also show that most tissue-dependent splicing is unlikely to have consequences at the proteome level.},
  author = {Reyes, Alejandro and Huber, Wolfgang},
  date = {2017},
  doi = {10/ggcxmj},
  file = {/Users/ryan/Documents/Zotero Library/Reyes and Huber - 2017 - Transcript Isoform Differences Across Human Tissue.pdf},
  journaltitle = {bioRxiv},
  pages = {1--23},
  title = {Transcript Isoform Differences across Human Tissues Are Predominantly Driven by Alternative Start and Termination Sites of Transcription}
}

@article{Ritchie2006,
  abstract = {Background: Assessment of array quality is an essential step in the analysis of data from microarray experiments. Once detected, less reliable arrays are typically excluded or "filtered" from further analysis to avoid misleading results. Results: In this article, a graduated approach to array quality is considered based on empirical reproducibility of the gene expression measures from replicate arrays. Weights are assigned to each microarray by fitting a heteroscedastic linear model with shared array variance terms. A novel gene-by-gene update algorithm is used to efficiently estimate the array variances. The inverse variances are used as weights in the linear model analysis to identify differentially expressed genes. The method successfully assigns lower weights to less reproducible arrays from different experiments. Down-weighting the observations from suspect arrays increases the power to detect differential expression. In smaller experiments, this approach outperforms the usual method of filtering the data. The method is available in the limma software package which is implemented in the R software environment. Conclusion: This method complements existing normalisation and spot quality procedures, and allows poorer quality arrays, which would otherwise be discarded, to be included in an analysis. It is applicable to microarray data from experiments with some level of replication. \textcopyright{} 2006 Ritchie et al; licensee BioMed Central Ltd.},
  author = {Ritchie, Matthew E. and Diyagama, Dileepa and Neilson, Jody and van Laar, Ryan and Dobrovic, Alexander and Holloway, Andrew and Smyth, Gordon K.},
  date = {2006},
  doi = {10/dxrnmz},
  file = {/Users/ryan/Documents/Zotero Library/Ritchie et al. - 2006 - Empirical array quality weights in the analysis of.pdf},
  issn = {1471-2105},
  journaltitle = {BMC Bioinformatics},
  options = {useprefix=true},
  title = {Empirical Array Quality Weights in the Analysis of Microarray Data},
  volume = {7}
}

@article{Ritchie2015,
  abstract = {limma is an R/Bioconductor software package that provides an integrated solution for analysing data from gene expression experiments. It contains rich features for handling complex experimental designs and for information borrowing to overcome the problem of small sample sizes. Over the past decade, limma has been a popular choice for gene discovery through differential expression analyses of microarray and high-throughput PCR data. The package contains particularly strong facilities for reading, normalizing and exploring such data. Recently, the capabilities of limma have been significantly expanded in two important directions. First, the package can now perform both differential expression and differential splicing analyses of RNA sequencing (RNA-seq) data. All the downstream analysis tools previously restricted to microarray data are now available for RNA-seq as well. These capabilities allow users to analyse both RNA-seq and microarray data with very similar pipelines. Second, the package is now able to go past the traditional gene-wise expression analyses in a variety of ways, analysing expression profiles in terms of co-regulated sets of genes or in terms of higher-order expression signatures. This provides enhanced possibilities for biological interpretation of gene expression differences. This article reviews the philosophy and design of the limma package, summarizing both new and historical features, with an emphasis on recent enhancements and features that have not been previously described.},
  author = {Ritchie, Matthew E. and Phipson, Belinda and Wu, Di and Hu, Yifang and Law, Charity W. and Shi, Wei and Smyth, Gordon K.},
  date = {2015},
  doi = {10/f7c4n5},
  eprint = {25605792},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Ritchie et al. - 2015 - limma powers differential expression analyses for .pdf},
  issn = {1362-4962},
  journaltitle = {Nucleic Acids Research},
  number = {7},
  pages = {1--13},
  title = {{{limma}} Powers Differential Expression Analyses for {{RNA}}-Sequencing and Microarray Studies},
  volume = {43}
}

@article{Roberts2013,
  author = {Roberts, Adam and Pachter, Lior},
  date = {2013},
  doi = {10/gfkms5},
  file = {/Users/ryan/Documents/Zotero Library/Roberts and Pachter - 2013 - Streaming fragment assignment for real-time analys.pdf},
  journaltitle = {Nature Methods},
  number = {1},
  pages = {71--73},
  title = {Streaming Fragment Assignment for Real-Time Analysis of Sequencing Experiments},
  volume = {10}
}

@article{Robinson2007,
  abstract = {Digital gene expression (DGE) technologies measure gene expression by counting sequence tags. They are sensitive technologies for measuring gene expression on a genomic scale, without the need for prior knowledge of the genome sequence. As the cost of sequencing DNA decreases, the number of DGE datasets is expected to grow dramatically. Various tests of differential expression have been proposed for replicated DGE data using binomial, Poisson, negative binomial or pseudo-likelihood (PL) models for the counts, but none of the these are usable when the number of replicates is very small.},
  author = {Robinson, Mark D and Smyth, Gordon K},
  date = {2007-11-01},
  doi = {10/b4kn7x},
  eprint = {17881408},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Robinson and Smyth - 2007 - Moderated statistical tests for assessing differen.pdf},
  issn = {1367-4811},
  journaltitle = {Bioinformatics},
  keywords = {Algorithms,Computer Simulation,Data Interpretation; Statistical,Expressed Sequence Tags,Gene Expression Profiling,Gene Expression Profiling: methods,Likelihood Functions,Models; Genetic,Models; Statistical,Poisson Distribution,Sequence Analysis; DNA,Sequence Analysis; DNA: methods,Signal Processing; Computer-Assisted},
  number = {21},
  pages = {2881--2887},
  title = {Moderated Statistical Tests for Assessing Differences in Tag Abundance},
  volume = {23}
}

@article{Robinson2008,
  abstract = {We derive a quantile-adjusted conditional maximum likelihood estimator for the dispersion parameter of the negative binomial distribution and compare its performance, in terms of bias, to various other methods. Our estimation scheme outperforms all other methods in very small samples, typical of those from serial analysis of gene expression studies, the motivating data for this study. The impact of dispersion estimation on hypothesis testing is studied. We derive an "exact" test that outperforms the standard approximate asymptotic tests.},
  author = {Robinson, Mark D and Smyth, Gordon K},
  date = {2008-04},
  doi = {10/d7gcxn},
  eprint = {17728317},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Robinson and Smyth - 2008 - Small-sample estimation of negative binomial dispe.pdf},
  issn = {1465-4644},
  journaltitle = {Biostatistics},
  keywords = {Bias (Epidemiology),Binomial Distribution,Biometry,Biometry: methods,Expressed Sequence Tags,Gene Expression Profiling,Gene Expression Profiling: methods,Gene Expression Profiling: statistics & numerical,Gene Library,Humans,Information Storage and Retrieval,Information Storage and Retrieval: methods,Information Storage and Retrieval: statistics & nu,Likelihood Functions,Regression Analysis,Research Design,Research Design: statistics & numerical data,RNA; Messenger,RNA; Messenger: analysis,Sample Size,Stochastic Processes,Weights and Measures},
  number = {2},
  pages = {321--332},
  title = {Small-Sample Estimation of Negative Binomial Dispersion, with Applications to {{SAGE}} Data},
  volume = {9}
}

@article{Robinson2010,
  abstract = {The fine detail provided by sequencing-based transcriptome surveys suggests that RNA-seq is likely to become the platform of choice for interrogating steady state RNA. In order to discover biologically important changes in expression, we show that normalization continues to be an essential step in the analysis. We outline a simple and effective method for performing normalization and show dramatically improved results for inferring differential expression in simulated and publicly available data sets.},
  author = {Robinson, Mark D and Oshlack, Alicia},
  date = {2010-01},
  doi = {10/cq6f8b},
  eprint = {20196867},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Robinson and Oshlack - 2010 - A scaling normalization method for differential ex.pdf},
  issn = {1465-6906},
  journaltitle = {Genome Biology},
  keywords = {Base Sequence,Base Sequence: genetics,Computer Simulation,Gene Expression Profiling,Gene Expression Profiling: methods,Gene Library,Models; Statistical,RNA,RNA: genetics},
  number = {3},
  pages = {R25},
  title = {A Scaling Normalization Method for Differential Expression Analysis of {{RNA}}-Seq Data},
  volume = {11}
}

@article{Robinson2010a,
  abstract = {SUMMARY: It is expected that emerging digital gene expression (DGE) technologies will overtake microarray technologies in the near future for many functional genomics applications. One of the fundamental data analysis tasks, especially for gene expression studies, involves determining whether there is evidence that counts for a transcript or exon are significantly different across experimental conditions. edgeR is a Bioconductor software package for examining differential expression of replicated count data. An overdispersed Poisson model is used to account for both biological and technical variability. Empirical Bayes methods are used to moderate the degree of overdispersion across transcripts, improving the reliability of inference. The methodology can be used even with the most minimal levels of replication, provided at least one phenotype or experimental condition is replicated. The software may have other applications beyond sequencing data, such as proteome peptide count data.

AVAILABILITY: The package is freely available under the LGPL licence from the Bioconductor web site (http://bioconductor.org).},
  author = {Robinson, Mark D and McCarthy, Davis J and Smyth, Gordon K},
  date = {2010-01-01},
  doi = {10/drxgw2},
  eprint = {19910308},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Robinson et al. - 2010 - edgeR a Bioconductor package for differential exp.pdf},
  issn = {1367-4811},
  journaltitle = {Bioinformatics},
  keywords = {Algorithms,Gene Expression Profiling,Gene Expression Profiling: methods,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods,Programming Languages,Signal Processing; Computer-Assisted,Software},
  number = {1},
  pages = {139--140},
  title = {{{edgeR}}: A {{Bioconductor}} Package for Differential Expression Analysis of Digital Gene Expression Data},
  volume = {26}
}

@manual{Robinson2012,
  author = {Chen, Yunshun and McCarthy, Davis J and Robinson, Mark D and Smyth, Gordon K},
  date = {2015-04},
  file = {/Users/ryan/Documents/Zotero Library/Chen et al. - 2015 - edgeR  differential expression analysis of digita.pdf},
  title = {{{edgeR}}: Differential Expression Analysis of Digital Gene Expression Data {{User}}'s {{Guide}}}
}

@article{robinsonThatBLUPGood1991,
  abstract = {In animal breeding, Best Linear Unbiased Prediction, or BLUP, is a technique for estimating genetic merits. In general, it is a method of estimating random effects. It can be used to derive the Kalman filter, the method of Kriging used for ore reserve estimation, credibility theory used to work out insurance premiums, and Hoadley's quality measurement plan used to estimate a quality index. It can be used for removing noise from images and for small-area estimation. This paper presents the theory of BLUP, some examples of its application and its relevance to the foundations of statistics. Understanding of procedures for estimating random effects should help people to understand some complicated and controversial issues about fixed and random effects models and also help to bridge the apparent gulf between the Bayesian and Classical schools of thought.},
  author = {Robinson, G. K.},
  date = {1991-02},
  doi = {10/dthnkh},
  file = {/Users/ryan/Documents/Zotero Library/Robinson - 1991 - That BLUP is a Good Thing The Estimation of Rando.pdf;/Users/ryan/Zotero/storage/H657KSSS/1177011926.html},
  ids = {Robinson1991a},
  issn = {0883-4237, 2168-8745},
  journaltitle = {Statistical Science},
  keywords = {Best linear unbiased predition (BLUP),credibility theory,estimation of random effects,fixed versus random effects,foundations of statistics,Kalman filtering,likelihood,parametric empirical Bayes methods,ranking and selection,selection index,small-area estimation},
  langid = {english},
  mrnumber = {MR1108815},
  number = {1},
  pages = {15--32},
  shortjournal = {Statist. Sci.},
  shorttitle = {That {{BLUP}} Is a {{Good Thing}}},
  title = {That {{BLUP}} Is a {{Good Thing}}: {{The Estimation}} of {{Random Effects}}},
  volume = {6},
  zmnumber = {0955.62500}
}

@article{Robles2012,
  abstract = {ABSTRACT: BACKGROUND: RNA sequencing (RNA-Seq) has emerged as a powerful approach for the detection of differential gene expression with both high-throughput and high resolution capabilities possible depending upon the experimental design chosen. Multiplex experimental designs are now readily available, these can be utilised to increase the numbers of samples or replicates profiled at the cost of decreased sequencing depth generated per sample. These strategies impact on the power of the approach to accurately identify differential expression. This study presents a detailed analysis of the power to detect differential expression in a range of scenarios including simulated null and differential expression distributions with varying numbers of biological or technical replicates, sequencing depths and analysis methods. RESULTS: Differential and non-differential expression datasets were simulated using a combination of negative binomial and exponential distributions derived from real RNA-Seq data. These datasets were used to evaluate the performance of three commonly used differential expression analysis algorithms and to quantify the changes in power with respect to true and false positive rates when simulating variations in sequencing depth, biological replication and multiplex experimental design choices. CONCLUSIONS: This work quantitatively explores comparisons between contemporary analysis tools and experimental design choices for the detection of differential expression using RNA-Seq. We found that the DESeq algorithm performs more conservatively than edgeR and NBPSeq. With regard to testing of various experimental designs, this work strongly suggests that greater power is gained through the use of biological replicates relative to library (technical) replicates and sequencing depth. Strikingly, sequencing depth could be reduced as low as 15\% without substantial impacts on false positive or true positive rates.},
  author = {Robles, Jos{\'e} A and Qureshi, Sumaira E and Stephen, Stuart J and Wilson, Susan R and Burden, Conrad J and Taylor, Jennifer M},
  date = {2012-01-17},
  doi = {10/gb3fq9},
  eprint = {22985019},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Robles et al. - 2012 - Efficient experimental design and analysis strateg.pdf},
  issn = {1471-2164},
  journaltitle = {BMC Genomics},
  keywords = {Algorithms,Gene Expression Profiling,Gene Expression Profiling: methods,Sequence Analysis; RNA,Sequence Analysis; RNA: methods,Statistics as Topic,Statistics as Topic: methods},
  number = {1},
  pages = {484},
  title = {Efficient Experimental Design and Analysis Strategies for the Detection of Differential Expression Using {{RNA}}-{{Sequencing}}},
  volume = {13}
}

@article{Roge2013,
  abstract = {SUMMARY: With the advances of RNA sequencing technologies, scientists need new tools to analyze transcriptome data. We introduce RNAseqViewer, a new visualization tool dedicated to RNA-Seq data. The program offers innovative ways to represent transcriptome data for single or multiple samples. It is a handy tool for scientists who use RNA-Seq data to compare multiple transcriptomes, for example, to compare gene expression and alternative splicing of cancer samples or of different development stages.

AVAILABILITY: RNAseqViewer is freely available for academic use at http://bioinfo.au.tsinghua.edu.cn/software/RNAseqViewer/ CONTACT: zhangxg@tsinghua.edu.cn SUPPLEMENTARY INFORMATION: Supplementary data are available at Bioinformatics online.},
  author = {Rog{\'e}, Xavier and Zhang, Xuegong},
  date = {2013-11-08},
  doi = {10/f5v2rs},
  eprint = {24215023},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Rogé and Zhang - 2013 - RNAseqViewer Visualization tool for RNA-Seq data..pdf},
  issn = {1367-4811},
  journaltitle = {Bioinformatics},
  pages = {2--3},
  title = {{{RNAseqViewer}}: {{Visualization}} Tool for {{RNA}}-{{Seq}} Data}
}

@article{Rogers2000,
  abstract = {The generation of memory T cells is critically important for rapid clearance and neutralization of pathogens encountered previously by the immune system. We have studied the kinetics of response and Ag dose requirements for proliferation and cytokine secretion of CD4+ memory T cells to examine whether there are qualitative changes which might lead to improved immunity. TCR Tg CD4+ T cells were primed in vitro and transferred into T cell-deficient hosts. After 6 or more weeks, the persisting T cells were exclusively small resting cells with a memory phenotype: CD44high CD62L+/- CD25-. Memory CD4 T cells showed a similar pattern of response as naive cells to peptide analogues with similar Ag dose requirements for IL-2 secretion. However, memory cells (derived from both Th2 and Th1 effectors) displayed faster kinetics of cytokine secretion, cell division, and proliferation, enhanced proliferation in response to low doses of Ag or peptide analogues, and production of IL-4, IL-5, and IFN-gamma. These results suggest there is a much more efficient response of CD4 memory T cells to Ag re-exposure and that the expanded functional capacity of memory cells will promote a rapid development of effector functions, providing more rapid and effective immunity.},
  author = {Rogers, Paul R. and Dubey, Caroline and Swain, Susan L.},
  date = {2000},
  doi = {10/gf4d35},
  eprint = {10679068},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Rogers et al. - 2000 - Qualitative Changes Accompany Memory T Cell Genera.pdf},
  issn = {0022-1767},
  journaltitle = {The Journal of Immunology},
  number = {5},
  pages = {2338--2346},
  title = {Qualitative {{Changes Accompany Memory T Cell Generation}}: {{Faster}}, {{More Effective Responses}} at {{Lower Doses}} of {{Antigen}}},
  volume = {164}
}

@incollection{Rougemont2012,
  author = {Rougemont, Jacques and Naef, Felix},
  booktitle = {Gene {{Regulatory Networks}}},
  date = {2012},
  doi = {10/cxbjh3},
  editor = {Deplancke, Bart and Gheldof, Nele},
  file = {/Users/ryan/Documents/Zotero Library/Rougemont and Naef - 2012 - Gene Regulatory Networks.pdf},
  isbn = {978-1-61779-291-5},
  keywords = {bioinformatics,chromatin immunoprecipitation,factor binding,transcription regulation,transcriptional,ultra-high-throughput sequencing},
  location = {Totowa, NJ},
  pages = {263--273},
  publisher = {Humana Press},
  series = {Methods in {{Molecular Biology}}},
  title = {Computational Analysis of Protein-{{DNA}} Interactions from {{ChIP}}-Seq Data},
  volume = {786}
}

@article{Rozowsky2009,
  abstract = {Chromatin immunoprecipitation (ChIP) followed by tag sequencing (ChIP-seq) using high-throughput next-generation instrumentation is fast, replacing chromatin immunoprecipitation followed by genome tiling array analysis (ChIP-chip) as the preferred approach for mapping of sites of transcription-factor binding and chromatin modification. Using two deeply sequenced data sets for human RNA polymerase II and STAT1, each with matching input-DNA controls, we describe a general scoring approach to address unique challenges in ChIP-seq data analysis. Our approach is based on the observation that sites of potential binding are strongly correlated with signal peaks in the control, likely revealing features of open chromatin. We develop a two-pass strategy called PeakSeq to compensate for this. A two-pass strategy compensates for signal caused by open chromatin, as revealed by inclusion of the controls. The first pass identifies putative binding sites and compensates for genomic variation in the 'mappability' of sequences. The second pass filters out sites not significantly enriched compared to the normalized control, computing precise enrichments and significances. Our scoring procedure enables us to optimize experimental design by estimating the depth of sequencing required for a desired level of coverage and demonstrating that more than two replicates provides only a marginal gain in information.},
  author = {Rozowsky, Joel and Euskirchen, Ghia and Auerbach, Raymond K and Zhang, Zhengdong D and Gibson, Theodore and Bjornson, Robert and Carriero, Nicholas and Snyder, Michael and Gerstein, Mark B},
  date = {2009-01},
  doi = {10/fjmtsj},
  eprint = {19122651},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Rozowsky et al. - 2009 - PeakSeq enables systematic scoring of ChIP-seq exp.pdf},
  issn = {1546-1696},
  journaltitle = {Nature Biotechnology},
  keywords = {Binding Sites,Biotechnology,Biotechnology: methods,Chromatin,Chromatin Immunoprecipitation,Chromatin Immunoprecipitation: methods,Chromatin: chemistry,DNA,DNA: chemistry,False Positive Reactions,Genetic Variation,Genome,Genomics,Humans,Models; Genetic,Oligonucleotide Array Sequence Analysis,RNA Polymerase II,RNA Polymerase II: chemistry,Sequence Analysis; DNA,Software},
  number = {1},
  pages = {66--75},
  title = {{{PeakSeq}} Enables Systematic Scoring of {{ChIP}}-Seq Experiments Relative to Controls},
  volume = {27}
}

@article{Rushton2008,
  abstract = {Background: Regulation of gene expression at the level of transcription is a major control point in many biological processes. Transcription factors (TFs) can activate and/or repress the transcriptional rate of target genes and vascular plant genomes devote approximately 7\% of their coding capacity to TFs. Global analysis of TFs has only been performed for three complete higher plant genomes - Arabidopsis (Arabidopsis thaliana), poplar (Populus trichocarpa) and rice (Oryza sativa). Presently, no large-scale analysis of TFs has been made from a member of the Solanaceae, one of the most important families of vascular plants. To fill this void, we have analysed tobacco (Nicotiana tabacum) TFs using a dataset of 1,159,022 gene-space sequence reads (GSRs) obtained by methylation filtering of the tobacco genome. An analytical pipeline was developed to isolate TF sequences from the GSR data set. This involved multiple (typically 10-15) independent searches with different versions of the TF family-defining domain(s) (normally the DNA-binding domain) followed by assembly into contigs and verification. Our analysis revealed that tobacco contains a minimum of 2,513 TFs representing all of the 64 well-characterised plant TF families. The number of TFs in tobacco is higher than previously reported for Arabidopsis and rice. Results: TOBFAC: the database of tobacco transcription factors, is an integrative database that provides a portal to sequence and phylogeny data for the identified TFs, together with a large quantity of other data concerning TFs in tobacco. The database contains an individual page dedicated to each of the 64 TF families. These contain background information, domain architecture via Pfam links, a list of all sequences and an assessment of the minimum number of TFs in this family in tobacco. 
Downloadable phylogenetic trees of the major families are provided along with detailed information on the bioinformatic pipeline that was used to find all family members. TOBFAC also contains EST data, a list of published tobacco TFs and a list of papers concerning tobacco TFs. The sequences and annotation data are stored in relational tables using a PostgrelSQL relational database management system. The data processing and analysis pipelines used the Perl programming language. The web interface was implemented in JavaScript and Perl CGI running on an Apache web server. The computationally intensive data processing and analysis pipelines were run on an Apple XServe cluster with more than 20 nodes. Conclusion: TOBFAC is an expandable knowledgebase of tobacco TFs with data currently available for over 2,513 TFs from 64 gene families. TOBFAC integrates available sequence information, phylogenetic analysis, and EST data with published reports on tobacco TF function. The database provides a major resource for the study of gene expression in tobacco and the Solanaceae and helps to fill a current gap in studies of TF families across the plant kingdom. TOBFAC is publicly accessible at http://compsysbio.achs.virginia.edu/tobfac/. \textcopyright{} 2008 Rushton et al; licensee BioMed Central Ltd.},
  author = {Rushton, Paul J. and Bokowiec, Marta T. and Laudeman, Thomas W. and Brannock, Jennifer F. and Chen, Xianfeng and Timko, Michael P.},
  date = {2008},
  doi = {10/fbkwv6},
  file = {/Users/ryan/Documents/Zotero Library/Rushton et al. - 2008 - TOBFAC The database of tobacco transcription fact.pdf},
  issn = {1471-2105},
  journaltitle = {BMC Bioinformatics},
  number = {1},
  pages = {53},
  title = {{{TOBFAC}}: {{The}} Database of Tobacco Transcription Factors},
  volume = {9}
}

@article{Russo2014,
  abstract = {UNLABELLED: We present RNASeqGUI R package, a graphical user interface (GUI) for the identification of differentially expressed genes across multiple biological conditions. This R package includes some well-known RNA-Seq tools, available at www.bioconductor.org. RNASeqGUI package is not just a collection of some known methods and functions, but it is designed to guide the user during the entire analysis process. RNASeqGUI package is mainly addressed to those users who have little experience with command-line software. Therefore, thanks to RNASeqGUI, they can conduct analogous analyses using this simple graphical interface. Moreover, RNASeqGUI is also helpful for those who are expert R-users because it speeds up the usage of the included RNASeq methods drastically.

AVAILABILITY AND IMPLEMENTATION: RNASeqGUI package needs the RGTK2 graphical library to run. This package is open source and is freely available under General Public License at http://bioinfo.na.iac.cnr.it/RNASeqGUI/Download.

SUPPLEMENTARY INFORMATION: Supplementary data are available at Bioinformatics online.},
  author = {Russo, Francesco and Angelini, Claudia},
  date = {2014},
  doi = {10/f6j6wr},
  eprint = {24812338},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Russo and Angelini - 2014 - RNASeqGUI A GUI for analysing RNA-Seq data.pdf},
  issn = {1460-2059},
  journaltitle = {Bioinformatics},
  number = {17},
  pages = {2514--2516},
  title = {{{RNASeqGUI}}: {{A GUI}} for Analysing {{RNA}}-{{Seq}} Data},
  volume = {30}
}

@article{Russo2016,
  abstract = {We present the advancements and novelties recently introduced in RNASeqGUI, a graphical user interface that helps biologists to handle and analyse large data collected in RNA-Seq experiments. This work focuses on the concept of reproducible research and shows how it has been incorporated in RNASeqGUI to provide reproducible (computational) results. The novel version of RNASeqGUI combines graphical interfaces with tools for reproducible research, such as literate statistical programming, human readable report, parallel executions, caching, and interactive and web-explorable tables of results. These features allow the user to analyse big datasets in a fast, efficient, and reproducible way. Moreover, this paper represents a proof of concept, showing a simple way to develop computational tools for Life Science in the spirit of reproducible research.},
  author = {Russo, Francesco and Righelli, Dario and Angelini, Claudia},
  date = {2016},
  doi = {10/ggcxmk},
  eprint = {26977414},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Russo et al. - 2016 - Advancements in RNASeqGUI towards a Reproducible A.pdf},
  issn = {2314-6141},
  journaltitle = {BioMed Research International},
  title = {Advancements in {{RNASeqGUI}} towards a {{Reproducible Analysis}} of {{RNA}}-{{Seq Experiments}}},
  volume = {2016}
}

@article{Ryan2007,
  abstract = {The ability of mesenchymal stem cells (MSC) to suppress alloresponsiveness is poorly understood. Herein, an allogeneic mixed lymphocyte response was used as a model to investigate the mechanisms of MSC-mediated immunomodulation. Human MSC are demonstrated to express the immunosuppressive cytokines hepatocyte growth factor (HGF), interleukin (IL)-10 and transforming growth factor (TGF)-{$\beta$}1 at concentrations that suppress alloresponses in vitro. MSC also express cyclooxygenase 1 and 2 and produce prostaglandin E2 constitutively. Blocking studies with indomethacin confirmed that prostaglandins contribute to MSC-mediated allosuppression. The proinflammatory cytokine interferon (IFN)-{$\gamma$} did not ablate MSC inhibition of alloantigen-driven proliferation but up-regulated HGF and TGF-{$\beta$}1. IFN-{$\gamma$} also induced expression of indoleamine 2,3, dioxygenase (IDO), involved in tryptophan catabolism. Use of an antagonist, 1-methyl-L-tryptophan, restored alloresponsiveness and confirmed an IDO contribution to IFN-{$\gamma$}-induced immunomodulation by MSC. Addition of the tryptophan catabolite kynurenine to mixed lymphocyte reactions (MLR), blocked alloproliferation. These findings support a model where IDO exerts its effect through the local accumulation of tryptophan metabolites rather than through tryptophan depletion. Taken together, these data demonstrate that soluble factors, or products derived from MSC, modulate immune responses and suggest that MSC create an immunosuppressive microenvironment capable of modulating alloresponsiveness even in the presence of IFN-{$\gamma$}. \textcopyright{} 2007 The Author(s).},
  author = {Ryan, J. M. and Barry, F. and Murphy, J. M. and Mahon, B. P.},
  date = {2007-08},
  doi = {10/b34t6x},
  file = {/Users/ryan/Documents/Zotero Library/Ryan et al. - 2007 - Interferon-γ does not break, but promotes the immu.pdf},
  issn = {0009-9104},
  journaltitle = {Clinical and Experimental Immunology},
  keywords = {IFN-γ,Inflammation,Stem cells},
  number = {2},
  pages = {353--363},
  title = {Interferon-{$\gamma$} Does Not Break, but Promotes the Immunosuppressive Capacity of Adult Human Mesenchymal Stem Cells},
  volume = {149}
}

@article{Sales2012a,
  abstract = {BACKGROUND: Gene set analysis is moving towards considering pathway topology as a crucial feature. Pathway elements are complex entities such as protein complexes, gene family members and chemical compounds. The conversion of pathway topology to a gene/protein networks (where nodes are a simple element like a gene/protein) is a critical and challenging task that enables topology-based gene set analyses. Unfortunately, currently available R/Bioconductor packages provide pathway networks only from single databases. They do not propagate signals through chemical compounds and do not differentiate between complexes and gene families.

RESULTS: Here we present graphite, a Bioconductor package addressing these issues. Pathway information from four different databases is interpreted following specific biologically-driven rules that allow the reconstruction of gene-gene networks taking into account protein complexes, gene families and sensibly removing chemical compounds from the final graphs. The resulting networks represent a uniform resource for pathway analyses. Indeed, graphite provides easy access to three recently proposed topological methods. The graphite package is available as part of the Bioconductor software suite.

CONCLUSIONS: graphite is an innovative package able to gather and make easily available the contents of the four major pathway databases. In the field of topological analysis graphite acts as a provider of biological information by reducing the pathway complexity considering the biological meaning of the pathway elements.},
  author = {Sales, Gabriele and Calura, Enrica and Cavalieri, Duccio and Romualdi, Chiara},
  date = {2012-01},
  doi = {10/fxr5c4},
  eprint = {22292714},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Sales et al. - 2012 - graphite - a Bioconductor package to convert pathw.pdf},
  issn = {1471-2105},
  journaltitle = {BMC Bioinformatics},
  keywords = {Gene Regulatory Networks,Humans,Insulin,Insulin: metabolism,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods,Receptor; Notch1,Receptor; Notch1: metabolism,Signal Transduction,Software},
  number = {1},
  pages = {20},
  title = {Graphite - a {{Bioconductor}} Package to Convert Pathway Topology to Gene Network},
  volume = {13}
}

@article{Salomon2002,
  author = {Salomon, Daniel R},
  date = {2002-10-01},
  doi = {10/bjm7nv},
  issn = {0272-6386},
  journaltitle = {American Journal of Kidney Diseases},
  keywords = {\#nosource},
  number = {4},
  pages = {674--677},
  title = {Protocol Biopsies Should Be Part of the Routine Management of Kidney Transplant Recipients},
  volume = {40}
}

@article{Salzman2012,
  abstract = {Most human pre-mRNAs are spliced into linear molecules that retain the exon order defined by the genomic sequence. By deep sequencing of RNA from a variety of normal and malignant human cells, we found RNA transcripts from many human genes in which the exons were arranged in a non-canonical order. Statistical estimates and biochemical assays provided strong evidence that a substantial fraction of the spliced transcripts from hundreds of genes are circular RNAs. Our results suggest that a non-canonical mode of RNA splicing, resulting in a circular RNA isoform, is a general feature of the gene expression program in human cells.},
  author = {Salzman, Julia and Gawad, Charles and Wang, Peter Lincoln and Lacayo, Norman and Brown, Patrick O},
  date = {2012-01},
  doi = {10/fzm78m},
  eprint = {22319583},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Salzman et al. - 2012 - Circular RNAs Are the Predominant Transcript Isofo.pdf},
  issn = {1932-6203},
  journaltitle = {PLoS ONE},
  number = {2},
  pages = {e30733},
  title = {Circular {{RNAs Are}} the {{Predominant Transcript Isoform}} from {{Hundreds}} of {{Human Genes}} in {{Diverse Cell Types}}},
  volume = {7}
}

@comment{NOTE(review), Sambasivarao2013: the title "NIH Public Access" looks like
  the banner of an author-manuscript PDF rather than the title of the work, and
  no journaltitle is recorded. The volume/number/pages and keywords possibly
  correspond to Acevedo et al., Current Pharmaceutical Design 18(9):1199-1216
  (2012) -- TODO confirm against the PDF and correct title, authors, date and
  journal. The isbn value should also be verified (an article normally has none).}
@article{Sambasivarao2013,
  author = {Sambasivarao, Somisetti V},
  date = {2013},
  file = {/Users/ryan/Documents/Zotero Library/Sambasivarao - 2013 - NIH Public Access.pdf},
  isbn = {6176321972},
  keywords = {computer-aided drug design,cyclophilin,free energy perturbation,hiv,reverse transcriptase},
  number = {9},
  pages = {1199--1216},
  title = {{{NIH Public Access}}},
  volume = {18}
}

% Sammeth (2009): predicting complete alternative splicing events from transcript annotations.
% The page range is stored in full (not abbreviated) so styles can format or compress it themselves;
% the title carries no terminal punctuation.
@article{Sammeth2009,
  abstract = {Eukaryotic splicing structures are known to involve a high degree of alternative forms derived from a premature transcript by alternative splicing (AS). With the advent of new sequencing technologies, evidence for new splice forms becomes increasingly available-bit by bit revealing that the true splicing diversity of "AS events" often comprises more than two alternatives and therefore cannot be sufficiently described by pairwise comparisons as conducted in analyzes hitherto. Especially, I emphasize on "complete" AS events which include all hitherto known variants of a splicing variation. Challenges emerge from the richness of data (millions of transcripts) and artifacts introduced during the technical process of obtaining transcript sequences ("noise")-especially when dealing with single-read sequences known as expressed sequence tags (ESTs). Herein, I describe a novel method to efficiently predict AS events in different resolutions ("dimensions") from transcript annotations that allows for combination of fragmented EST data with full-length cDNAs and can cope with large datasets containing noise. At the doorstep of many new splice forms becoming available by novel high-throughput sequencing technologies, the presented method helps to dynamically update AS databases. Applying this method to estimate the real complexity of alternative splicing, I found in human and murine annotations thousands of novel AS events that either have been disregarded or mischaracterized in earlier works. The growth of evidence for such events suggests that the number still keeps climbing. When considering complete events, the majority of exons that are observed as "mutually exclusive" in pairwise comparisons in fact involves at least one other alternative splice form that disagrees with their mutual exclusion. Similar observations also hold for the alternative skipping of two subsequent exons.
Results suggest that the systematical analysis of complete AS events on large scale provides subtle insights in the mechanisms that drive (alternative) splicing.},
  author = {Sammeth, Michael},
  date = {2009-08},
  doi = {10/fb75mv},
  eprint = {19689216},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Sammeth - 2009 - Complete alternative splicing events are bubbles i.pdf},
  issn = {1557-8666},
  journaltitle = {Journal of computational biology : a journal of computational molecular cell biology},
  keywords = {Alternative Splicing,Animals,Exons,Genomics,Genomics: methods,Humans,Introns,Mice,Models; Genetic,RNA,RNA: genetics},
  number = {8},
  pages = {1117-1140},
  title = {Complete Alternative Splicing Events Are Bubbles in Splicing Graphs},
  volume = {16}
}

% Sarkar et al. (2017): selective-alignment on top of quasi-mapping.
% Keywords hold only the actual terms; the "and phrases" prefix was residue of the
% "Keywords and phrases" heading picked up during PDF metadata extraction.
% NOTE(review): no journaltitle or volume is recorded -- confirm the venue.
@article{Sarkar2017,
  author = {Sarkar, Hirak and Zakeri, Mohsen and Malik, Laraib and Patro, Rob},
  date = {2017},
  doi = {10/ggcxmm},
  file = {/Users/ryan/Documents/Zotero Library/Sarkar et al. - 2017 - Towards selective-alignment  producing accurate a.pdf},
  keywords = {alignment,mapping,quantification,rna-seq,selective alignment},
  pages = {1-14},
  title = {Towards Selective-Alignment: Producing Accurate and Sensitive Alignments Using Quasi-Mapping}
}

% Sarkar and Patro (2017), Bioinformatics: Quark, semi-reference-based compression of RNA-seq data.
@article{Sarkar2017a,
  author       = {Sarkar, Hirak and Patro, Rob},
  title        = {Quark Enables Semi-Reference Based Compression of {{RNA}}-Seq Data},
  journaltitle = {Bioinformatics},
  date         = {2017-07-03},
  pages        = {1-7},
  doi          = {10/gchcn6},
  issn         = {1367-4803},
  file         = {/Users/ryan/Documents/Zotero Library/Sarkar and Patro - 2017 - Quark enables semi-reference based compression of .pdf},
}

% Sarwal et al. (2003): DNA microarray profiling of acute renal allograft rejection.
% The page range is stored in full (not abbreviated); the title carries no terminal punctuation.
@article{Sarwal2003,
  abstract = {BACKGROUND: The causes and clinical course of acute rejection vary, and it is not possible to predict graft outcome reliably on the basis of available clinical, pathological, and genetic markers. We hypothesized that previously unrecognized molecular heterogeneity might underlie some of the variability in the clinical course of acute renal allograft rejection and in its response to treatment.

METHODS: We used DNA microarrays in a systematic study of gene-expression patterns in biopsy samples from normal and dysfunctional renal allografts. A combination of exploratory and supervised bioinformatic methods was used to analyze these profiles.

RESULTS: We found consistent differences among the gene-expression patterns associated with acute rejection, nephrotoxic effects of drugs, chronic allograft nephropathy, and normal kidneys. The gene-expression patterns associated with acute rejection suggested at least three possible distinct subtypes of acute rejection that, although indistinguishable by light microscopy, were marked by differences in immune activation and cellular proliferation. Since the gene-expression patterns pointed to substantial variation in the composition of immune infiltrates, we used immunohistochemical staining to define these subtypes further. This analysis revealed a striking association between dense CD20+ B-cell infiltrates and both clinical glucocorticoid resistance (P=0.01) and graft loss (P{$<$}0.001).

CONCLUSIONS: Systematic analysis of gene-expression patterns provides a window on the biology and pathogenesis of renal allograft rejection. Biopsy samples from patients with acute rejection that are indistinguishable on conventional histologic analysis reveal extensive differences in gene expression, which are associated with differences in immunologic and cellular features and clinical course. The presence of dense clusters of B cells in a biopsy sample was strongly associated with severe graft rejection, suggesting a pivotal role of infiltrating B cells in acute rejection.},
  author = {Sarwal, Minnie and Chua, Mei-Sze and Kambham, Neeraja and Hsieh, Szu-Chuan and Satterwhite, Thomas and Masek, Marilyn and Salvatierra, Oscar},
  date = {2003-07-10},
  doi = {10/b9gs7r},
  eprint = {12853585},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Sarwal et al. - 2003 - Molecular heterogeneity in acute renal allograft r.pdf},
  issn = {1533-4406},
  journaltitle = {The New England journal of medicine},
  keywords = {Acute Disease,Adolescent,Adult,Antigens; CD20,Antigens; CD20: analysis,B-Lymphocytes,B-Lymphocytes: immunology,Biopsy,Child,Child; Preschool,Computational Biology,Drug Resistance,Drug Resistance: immunology,Gene Expression,Gene Expression Profiling,Genetic Heterogeneity,Glucocorticoids,Glucocorticoids: therapeutic use,Graft Rejection,Graft Rejection: classification,Graft Rejection: drug therapy,Graft Rejection: genetics,Graft Rejection: immunology,Humans,Immunohistochemistry,Infant,Kidney,Kidney Transplantation,Kidney Transplantation: immunology,Kidney Transplantation: pathology,Kidney: immunology,Kidney: pathology,Oligonucleotide Array Sequence Analysis,T-Lymphocytes,Transplantation; Homologous},
  number = {2},
  pages = {125-138},
  title = {Molecular Heterogeneity in Acute Renal Allograft Rejection Identified by {{DNA}} Microarray Profiling},
  volume = {349}
}

% Sathe (2013): unpublished item on RNA-Seq applications in R and D at GSK; the keyword records that no DOI was found.
@unpublished{Sathe2013,
  author   = {Sathe, Ganesh},
  title    = {Applications of {{RNA}}-{{Seq}} within {{Research}} \& {{Development}} at {{GSK}}},
  date     = {2013-06-19},
  keywords = {⛔ No DOI found},
  file     = {/Users/ryan/Documents/Zotero Library/Sathe - 2013 - Applications of RNA-Seq within Research & Developm.pdf},
}

% Schatz et al. (2007): high-throughput sequence alignment on graphics processing units.
% The title is stored without terminal punctuation; the bibliography style supplies it.
@article{Schatz2007,
  abstract = {The recent availability of new, less expensive high-throughput DNA sequencing technologies has yielded a dramatic increase in the volume of sequence data that must be analyzed. These data are being generated for several purposes, including genotyping, genome resequencing, metagenomics, and de novo genome assembly projects. Sequence alignment programs such as MUMmer have proven essential for analysis of these data, but researchers will need ever faster, high-throughput alignment tools running on inexpensive hardware to keep up with new sequence technologies.},
  author = {Schatz, Michael C and Trapnell, Cole and Delcher, Arthur L and Varshney, Amitabh},
  date = {2007-01},
  doi = {10/d4j3b6},
  eprint = {18070356},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Schatz et al. - 2007 - High-throughput sequence alignment using Graphics .pdf},
  issn = {1471-2105},
  journaltitle = {BMC bioinformatics},
  keywords = {Animals,Bacillus anthracis,Bacillus anthracis: genetics,Base Sequence,Caenorhabditis,Caenorhabditis: genetics,Computer Graphics,Computer Graphics: economics,Computer Graphics: instrumentation,Computers,Computers: economics,Contig Mapping,Contig Mapping: economics,Contig Mapping: instrumentation,Database Management Systems,Databases; Genetic,DNA,DNA: ultrastructure,Genomic Library,Listeria monocytogenes,Listeria monocytogenes: genetics,Sequence Alignment,Sequence Alignment: economics,Sequence Alignment: instrumentation,Sequence Alignment: methods,Sequence Analysis; DNA,Sequence Analysis; DNA: economics,Sequence Analysis; DNA: instrumentation,Sequence Analysis; DNA: methods,Streptococcus suis,Streptococcus suis: genetics,Time Factors,Work Simplification},
  pages = {474},
  title = {High-Throughput Sequence Alignment Using {{Graphics Processing Units}}},
  volume = {8}
}

% Schmeing and Ramakrishnan (2009), Nature 461(7268): review of ribosome structures and translation.
% The page range is stored in full (not abbreviated); the title carries no terminal punctuation.
@article{Schmeing2009,
  abstract = {The high-resolution structures of ribosomal subunits published in 2000 have revolutionized the field of protein translation. They facilitated the determination and interpretation of functional complexes of the ribosome by crystallography and electron microscopy. Knowledge of the precise positions of residues in the ribosome in various states has facilitated increasingly sophisticated biochemical and genetic experiments, as well as the use of new methods such as single-molecule kinetics. In this review, we discuss how the interaction between structural and functional studies over the last decade has led to a deeper understanding of the complex mechanisms underlying translation.},
  author = {Schmeing, T Martin and Ramakrishnan, V},
  date = {2009-10-29},
  doi = {10/b29tv8},
  eprint = {19838167},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Schmeing and Ramakrishnan - 2009 - What recent ribosome structures have revealed abou.pdf},
  issn = {1476-4687},
  journaltitle = {Nature},
  keywords = {Bacterial Proteins,Bacterial Proteins: chemistry,Bacterial Proteins: metabolism,Biocatalysis,Protein Biosynthesis,Protein Biosynthesis: physiology,Ribosomal Proteins,Ribosomal Proteins: metabolism,Ribosomes,Ribosomes: chemistry,Ribosomes: metabolism,Structure-Activity Relationship},
  number = {7268},
  pages = {1234-1242},
  title = {What Recent Ribosome Structures Have Revealed about the Mechanism of Translation},
  volume = {461}
}

% Schmidl et al. (2009), Genome Research 19(7): lineage-specific DNA methylation in Treg and Tconv cells.
% Also citable under the legacy key listed in ids.
% The date is ISO 8601 (YYYY-MM-DD): issue 7 of this monthly volume is the July issue, so the
% month is 07; the source record evidently had day and month transposed.
@article{schmidlLineagespecificDNAMethylation2009,
  abstract = {DNA methylation participates in establishing and maintaining chromatin structures and regulates gene transcription during mammalian development and cellular differentiation. With few exceptions, research thus far has focused on gene promoters, and little is known about the extent, functional relevance, and regulation of cell type-specific DNA methylation at promoter-distal sites. Here, we present a comprehensive analysis of differential DNA methylation in human conventional CD4+ T cells (Tconv) and CD4+CD25+ regulatory T cells (Treg), cell types whose differentiation and function are known to be controlled by epigenetic mechanisms. Using a novel approach that is based on the separation of a genome into methylated and unmethylated fractions, we examined the extent of lineage-specific DNA methylation across whole gene loci. More than 100 differentially methylated regions (DMRs) were identified that are present mainly in cell type-specific genes (e.g., FOXP3, IL2RA, CTLA4, CD40LG, and IFNG) and show differential patterns of histone H3 lysine 4 methylation. Interestingly, the majority of DMRs were located at promoter-distal sites, and many of these areas harbor DNA methylation-dependent enhancer activity in reporter gene assays. Thus, our study provides a comprehensive, locus-wide analysis of lineage-specific methylation patterns in Treg and Tconv cells, links cell type-specific DNA methylation with histone methylation and regulatory function, and identifies a number of cell type-specific, CpG methylation-sensitive enhancers in immunologically relevant genes.},
  author = {Schmidl, Christian and Klug, Maja and Boeld, Tina J. and Andreesen, Reinhard and Hoffmann, Petra and Edinger, Matthias and Rehli, Michael},
  date = {2009-07-01},
  doi = {10/b74h6x},
  eprint = {19494038},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Schmidl et al. - 2009 - Lineage-specific DNA methylation in T cells correl.pdf;/Users/ryan/Zotero/storage/U4NTTHJN/1165.html},
  ids = {Schmidl2009},
  issn = {1088-9051, 1549-5469},
  journaltitle = {Genome Research},
  keywords = {epigenetics,methylation},
  langid = {english},
  number = {7},
  pages = {1165-1174},
  shortjournal = {Genome Res.},
  title = {Lineage-Specific {{DNA}} Methylation in {{T}} Cells Correlates with Histone Methylation and Enhancer Activity},
  volume = {19}
}

% Schmitz et al. (2012), Nature: Burkitt lymphoma pathogenesis from structural and functional genomics.
% Author names use the "Last, First" form. The middle initial of Thomas A Waldmann belongs to the
% given name; stored before the surname in lowercase it would be parsed as a name prefix (von part).
% The accented letter is written as a brace group so it counts as a single letter when sorting.
% NOTE(review): pages 3-7 with no volume or number looks like an advance-online placeholder --
% confirm against the final published record.
@article{Schmitz2012,
  abstract = {Burkitt's lymphoma (BL) can often be cured by intensive chemotherapy, but the toxicity of such therapy precludes its use in the elderly and in patients with endemic BL in developing countries, necessitating new strategies. The normal germinal centre B cell is the presumed cell of origin for both BL and diffuse large B-cell lymphoma (DLBCL), yet gene expression analysis suggests that these malignancies may use different oncogenic pathways. BL is subdivided into a sporadic subtype that is diagnosed in developed countries, the Epstein-Barr-virus-associated endemic subtype, and an HIV-associated subtype, but it is unclear whether these subtypes use similar or divergent oncogenic mechanisms. Here we used high-throughput RNA sequencing and RNA interference screening to discover essential regulatory pathways in BL that cooperate with MYC, the defining oncogene of this cancer. In 70\% of sporadic BL cases, mutations affecting the transcription factor TCF3 (E2A) or its negative regulator ID3 fostered TCF3 dependency. TCF3 activated the pro-survival phosphatidylinositol-3-OH kinase pathway in BL, in part by augmenting tonic B-cell receptor signalling. In 38\% of sporadic BL cases, oncogenic CCND3 mutations produced highly stable cyclin D3 isoforms that drive cell cycle progression. These findings suggest opportunities to improve therapy for patients with BL.},
  author = {Schmitz, Roland and Young, Ryan M and Ceribelli, Michele and Jhavar, Sameer and Xiao, Wenming and Zhang, Meili and Wright, George and Shaffer, Arthur L and Hodson, Daniel J and Buras, Eric and Liu, Xuelu and Powell, John and Yang, Yandan and Xu, Weihong and Zhao, Hong and Kohlhammer, Holger and Rosenwald, Andreas and Kluin, Philip and M{\"u}ller-Hermelink, Hans Konrad and Ott, German and Gascoyne, Randy D and Connors, Joseph M and Rimsza, Lisa M and Campo, Elias and Jaffe, Elaine S and Delabie, Jan and Smeland, Erlend B and Ogwang, Martin D and Reynolds, Steven J and Fisher, Richard I and Braziel, Rita M and Tubbs, Raymond R and Cook, James R and Weisenburger, Dennis D and Chan, Wing C and Pittaluga, Stefania and Wilson, Wyndham and Waldmann, Thomas A and Rowe, Martin and Mbulaiteye, Sam M and Rickinson, Alan B and Staudt, Louis M},
  date = {2012-08-12},
  doi = {10/f38xrh},
  eprint = {22885699},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Schmitz et al. - 2012 - Burkitt lymphoma pathogenesis and therapeutic targ.pdf},
  issn = {1476-4687},
  journaltitle = {Nature},
  pages = {3-7},
  title = {Burkitt Lymphoma Pathogenesis and Therapeutic Targets from Structural and Functional Genomics}
}

% Schneider et al. (2017), Genome Research 27(5): evaluation of GRCh38 and de novo haploid assemblies.
% The ISSN is written in the standard hyphenated NNNN-NNNN form.
@article{Schneider2017,
  abstract = {The human reference genome assembly plays a central role in nearly all aspects of today's basic and clinical research. GRCh38 is the first coordinate-changing assembly update since 2009; it reflects the resolution of roughly 1000 issues and encompasses modifications ranging from thousands of single base changes to megabase-scale path reorganizations, gap closures, and localization of previously orphaned sequences. We developed a new approach to sequence generation for targeted base updates and used data from new genome mapping technologies and single haplotype resources to identify and resolve larger assembly issues. For the first time, the reference assembly contains sequence-based representations for the centromeres. We also expanded the number of alternate loci to create a reference that provides a more robust representation of human population variation. We demonstrate that the updates render the reference an improved annotation substrate, alter read alignments in unchanged regions, and impact variant interpretation at clinically relevant loci. We additionally evaluated a collection of new de novo long-read haploid assemblies and conclude that although the new assemblies compare favorably to the reference with respect to continuity, error rate, and gene completeness, the reference still provides the best representation for complex genomic regions and coding sequences. We assert that the collected updates in GRCh38 make the newer assembly a more robust substrate for comprehensive analyses that will promote our understanding of human biology and advance our efforts to improve health.},
  author = {Schneider, Valerie A. and Graves-Lindsay, Tina and Howe, Kerstin and Bouk, Nathan and Chen, Hsiu Chuan and Kitts, Paul A. and Murphy, Terence D. and Pruitt, Kim D. and Thibaud-Nissen, Fran{\c c}oise and Albracht, Derek and Fulton, Robert S. and Kremitzki, Milinn and Magrini, Vincent and Markovic, Chris and McGrath, Sean and Steinberg, Karyn Meltz and Auger, Kate and Chow, William and Collins, Joanna and Harden, Glenn and Hubbard, Timothy and Pelan, Sarah and Simpson, Jared T. and Threadgold, Glen and Torrance, James and Wood, Jonathan M. and Clarke, Laura and Koren, Sergey and Boitano, Matthew and Peluso, Paul and Li, Heng and Chin, Chen Shan and Phillippy, Adam M. and Durbin, Richard and Wilson, Richard K. and Flicek, Paul and Eichler, Evan E. and Church, Deanna M.},
  date = {2017},
  doi = {10/f92cmg},
  eprint = {28396521},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Schneider et al. - 2017 - Evaluation of GRCh38 and de novo haploid genome as.pdf},
  issn = {1549-5469},
  journaltitle = {Genome Research},
  keywords = {annotation,bioinformatics,biology,clinical research,genetics,genomics,haplotype,human biology,locus,ranging,reference genome},
  number = {5},
  pages = {849-864},
  title = {Evaluation of {{GRCh38}} and de Novo Haploid Genome Assemblies Demonstrates the Enduring Quality of the Reference Assembly},
  volume = {27}
}

% Schroder et al. (2004): review of interferon-gamma signals, mechanisms and functions.
% The page range is stored in full (not abbreviated); the title carries no terminal punctuation.
@article{Schroder2004,
  abstract = {Interferon-gamma (IFN-gamma) coordinates a diverse array of cellular programs through transcriptional regulation of immunologically relevant genes. This article reviews the current understanding of IFN-gamma ligand, receptor, signal transduction, and cellular effects with a focus on macrophage responses and to a lesser extent, responses from other cell types that influence macrophage function during infection. The current model for IFN-gamma signal transduction is discussed, as well as signal regulation and factors conferring signal specificity. Cellular effects of IFN-gamma are described, including up-regulation of pathogen recognition, antigen processing and presentation, the antiviral state, inhibition of cellular proliferation and effects on apoptosis, activation of microbicidal effector functions, immunomodulation, and leukocyte trafficking. In addition, integration of signaling and response with other cytokines and pathogen-associated molecular patterns, such as tumor necrosis factor-alpha, interleukin-4, type I IFNs, and lipopolysaccharide are discussed.},
  author = {Schroder, Kate and Hertzog, Paul J and Ravasi, Timothy and Hume, David A},
  date = {2004-02},
  doi = {10/b8gt4p},
  eprint = {14525967},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Schroder et al. - 2004 - Interferon-gamma an overview of signals, mechanis.pdf},
  issn = {0741-5400},
  journaltitle = {Journal of leukocyte biology},
  keywords = {Animals,Gene Expression Regulation,Gene Expression Regulation: immunology,Humans,Inflammation,Inflammation: immunology,Interferon-gamma,Interferon-gamma: immunology,Interferon-gamma: physiology,Macrophages,Macrophages: immunology,Receptor Cross-Talk,Receptor Cross-Talk: immunology,Signal Transduction,Signal Transduction: immunology},
  number = {2},
  pages = {163-189},
  title = {Interferon-Gamma: An Overview of Signals, Mechanisms and Functions},
  volume = {75}
}

% Sciuto et al. (2018), Molecular and Cellular Proteomics 17(5): two-step coimmunoprecipitation (TIP).
% The abstract has PDF line-break hyphenation removed; the ISSN is in the hyphenated NNNN-NNNN form;
% the accented letter is written as a brace group so it counts as a single letter when sorting.
@article{Sciuto2018,
  abstract = {Coimmunoprecipitation (co-IP) is one of the most frequently used techniques to study protein-protein (PPIs) or protein-nucleic acid interactions (PNIs). However, the presence of coprecipitated contaminants is a well-recognized issue associated with single-step co-IPs. To overcome this limitation, we developed the two-step co-IP (TIP) strategy that enables sequential coimmunoprecipitations of endogenous protein complexes. TIP can be performed with a broad range of mono- and polyclonal antibodies targeting a single protein or different components of a given complex. TIP results in a highly selective enrichment of protein complexes and thus outperforms single-step co-IPs for downstream applications such as mass spectrometry for the identification of PPIs and quantitative PCR for the analysis of PNIs. We benchmarked TIP for the identification of CD95/FAS-interacting proteins in primary human CD4 T cells, which recapitulated all major known interactors, but also enabled the proteomics discovery of PPM1G and IPO7 as new interaction partners. For its feasibility and high performance, we propose TIP as an advanced tool for the isolation of highly purified protein-protein and protein-nucleic acid complexes under native expression conditions.},
  author = {Sciuto, Maria Rita and Warnken, Uwe and Schn{\"o}lzer, Martina and Valvo, Cecilia and Brunetto, Lidia and Boe, Alessandra and Biffoni, Mauro and Krammer, Peter H. and De Maria, Ruggero and Haas, Tobias L.},
  date = {2018},
  doi = {10/gdhwpm},
  file = {/Users/ryan/Documents/Zotero Library/Sciuto et al. - 2018 - Two-Step Coimmunoprecipitation (TIP) enables effic.pdf},
  issn = {1535-9484},
  journaltitle = {Molecular and Cellular Proteomics},
  number = {5},
  pages = {993-1009},
  title = {Two-{{Step Coimmunoprecipitation}} ({{TIP}}) Enables Efficient and Highly Selective Isolation of Native Protein Complexes},
  volume = {17}
}

% Scott et al. (2015), Journal of the American Statistical Association 110(510): false discovery rate regression.
@article{Scott2015,
  author       = {Scott, James G. and Kelly, Ryan C. and Smith, Matthew A. and Zhou, Pengcheng and Kass, Robert E.},
  title        = {False {{Discovery Rate Regression}}: {{An Application}} to {{Neural Synchrony Detection}} in {{Primary Visual Cortex}}},
  journaltitle = {Journal of the American Statistical Association},
  date         = {2015-04-03},
  volume       = {110},
  number       = {510},
  pages        = {459-471},
  doi          = {10/f7h4n5},
  issn         = {0162-1459},
  file         = {/Users/ryan/Documents/Zotero Library/Scott et al. - 2015 - False Discovery Rate Regression An Application to.pdf},
}

% Scott et al. (2016), bioRxiv: RASLseqTools, methods for designing and analyzing RASL-seq experiments.
@article{Scott2016,
  author       = {Scott, Erick R and Larman, H Benjamin and Torkamani, Ali and Schork, Nicholas J and Wineinger, Nathan and Nanis, Max and Thompson, Ryan C. and Beheshti Zavareh, Reza B. and Lairson, Luke L and Schultz, Peter G and Su, Andrew I.},
  title        = {{{RASLseqTools}}: Open-Source Methods for Designing and Analyzing {{RNA}}-Mediated Oligonucleotide {{Annealing}}, {{Selection}}, and, {{Ligation}} Sequencing ({{RASL}}-Seq) Experiments},
  journaltitle = {bioRxiv},
  date         = {2016},
  doi          = {10/ggcxmn},
  file         = {/Users/ryan/Documents/Zotero Library/Scott et al. - 2016 - RASLseqTools open-source methods for designing an.pdf},
}

% Sculley et al.: Hidden Technical Debt in Machine Learning Systems.
% The date and the full author list follow the published NeurIPS (NIPS) 2015 paper, so the
% entry no longer renders as undated.
% NOTE(review): the entry type is article but no journaltitle is recorded, and pages 1-9 looks
% like the PDF page count rather than the proceedings pagination -- confirm and complete.
@article{Sculley,
  author = {Sculley, D and Holt, Gary and Golovin, Daniel and Davydov, Eugene and Phillips, Todd and Ebner, Dietmar and Chaudhary, Vinay and Young, Michael and Crespo, Jean-Fran{\c c}ois and Dennison, Dan},
  date = {2015},
  file = {/Users/ryan/Documents/Zotero Library/Sculley et al. - Hidden Technical Debt in Machine Learning Systems.pdf},
  pages = {1-9},
  title = {Hidden {{Technical Debt}} in {{Machine Learning Systems}}}
}

% Searle et al. (2008), Journal of Proteome Research: combining results from multiple MS/MS search engines.
% The title holds only the article title (no journal section label); keywords are rejoined where
% PDF extraction split them across a line break or on punctuation.
% NOTE(review): the keyword "x! tandem" is reconstructed from the fragments "tandem" and "x" -- confirm.
@article{Searle2008,
  author = {Searle, Brian C and Turner, Mark and Nesvizhskii, Alexey I},
  date = {2008},
  doi = {10/c636pp},
  file = {/Users/ryan/Documents/Zotero Library/Searle et al. - 2008 - Improving Sensitivity by Probabilistically Combini.pdf},
  journaltitle = {Journal of Proteome Research},
  keywords = {bioinformatics,database searching,mascot,mass spectrometry,peptide identification,probability,protein identification,proteomics,sequest,x! tandem},
  pages = {245-253},
  title = {Improving {{Sensitivity}} by {{Probabilistically Combining Results}} from {{Multiple MS}}/{{MS Search Methodologies}}}
}

% Semedo et al. (2010): bone marrow mononuclear cells and fibrosis after acute kidney injury.
% The page range is stored in full (not abbreviated); the accented letter is written as a brace
% group so it counts as a single letter when sorting; the title carries no terminal punctuation.
@article{Semedo2010,
  abstract = {One of the early phases that lead to fibrosis progression is inflammation. Once this stage is resolved, fibrosis might be prevented. Bone marrow mononuclear cells (BMMCs) are emerging as a new therapy for several pathologies, including autoimmune diseases, because they enact immunosuppression. In this study we aimed to evaluate the role of BMMC administration in a model of kidney fibrosis induced by an acute injury. C57Bl6 mice were subjected to unilateral severe ischemia by clamping the left renal pedicle for 1h. BMMCs were isolated from femurs and tibia, and after 6h of reperfusion, 1 x 10(6) cells were administrated intraperitoneally. At 24h after surgery, treated animals showed a significant decrease in creatinine and urea levels when compared with untreated animals. Different administration routes were tested. Moreover, interferon (IFN) receptor knockout BMMCs were used, as this receptor is necessary for BMMC activation. Labeled BMMCs were found in ischemic kidney on FACS analysis. This improved outcome was associated with modulation of inflammation in the kidney and systemic modulation, as determined by cytokine expression profiling. Despite non-amelioration of functional parameters, kidney mRNA expression of interleukin (IL)-6 at 6 weeks was lower in BMMC-treated animals, as were levels of collagen 1, connective tissue growth factor (CTGF), transforming growth factor-beta (TGF-beta) and vimentin. Protective molecules, such as IL-10, heme oxygenase 1 (HO-1) and bone morphogenetic 7 (BMP-7), were increased in treated animals after 6 weeks. Moreover, Masson and Picrosirius red staining analyses showed less fibrotic areas in the kidneys of treated animals. Thus, early modulation of inflammation by BMMCs after an ischemic injury leads to reduced fibrosis through modulation of early inflammation.},
  author = {Semedo, Patricia and Donizetti-Oliveira, Cassiano and Burgos-Silva, Marina and Cenedeze, Marco Antonio and Avancini Costa Malheiros, Denise Maria and Pacheco-Silva, Alvaro and C{\^a}mara, Niels Olsen Saraiva},
  date = {2010-05},
  doi = {10/dbms3t},
  eprint = {20308984},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Semedo et al. - 2010 - Bone marrow mononuclear cells attenuate fibrosis d.pdf},
  issn = {1530-0307},
  journaltitle = {Laboratory investigation; a journal of technical methods and pathology},
  keywords = {Acute Disease,acute-kidney-injury,Animals,Antigens; CD34,Antigens; CD34: analysis,Bone Marrow Cells,Bone Marrow Cells: cytology,Bone Marrow Cells: metabolism,Bone Morphogenetic Protein 7,Bone Morphogenetic Protein 7: genetics,Bone Morphogenetic Protein 7: metabolism,Cell Transplantation,Cell Transplantation: methods,Cells; Cultured,cyno-project,Cytokines,Cytokines: genetics,Cytokines: metabolism,Female,Fibrosis,Fibrosis: surgery,Gene Expression,Heme Oxygenase-1,Heme Oxygenase-1: genetics,Heme Oxygenase-1: metabolism,Immunohistochemistry,Immunophenotyping,Ischemia,Ischemia: complications,Kidney,Kidney Diseases,Kidney Diseases: etiology,Kidney Diseases: surgery,Kidney: blood supply,Kidney: pathology,Leukocytes; Mononuclear,Leukocytes; Mononuclear: cytology,Leukocytes; Mononuclear: metabolism,Leukocytes; Mononuclear: transplantation,Male,Mice,Mice; Inbred C57BL,Proto-Oncogene Proteins c-kit,Proto-Oncogene Proteins c-kit: analysis,Reverse Transcriptase Polymerase Chain Reaction},
  number = {5},
  pages = {685-695},
  title = {Bone Marrow Mononuclear Cells Attenuate Fibrosis Development after Severe Acute Kidney Injury},
  volume = {90}
}

% NCBI Bookshelf guide to converting .sra files with the SRA Toolkit.
% The corporate author is double-braced so it is treated as a single name; the entry is also
% citable under the short key listed in ids.
@book{sequencereadarchivesubmissionsstaffUsingSRAToolkit2011,
  author    = {{Sequence Read Archive Submissions Staff}},
  title     = {Using the {{SRA Toolkit}} to Convert .Sra Files into Other Formats},
  publisher = {{National Center for Biotechnology Information (US)}},
  date      = {2011},
  url       = {https://www.ncbi.nlm.nih.gov/books/NBK158900/},
  urldate   = {2019-11-14},
  langid    = {english},
  ids       = {sra-toolkit},
  file      = {/Users/ryan/Documents/Zotero Library/Sequence Read Archive Submissions Staff - 2011 - Using the SRA Toolkit to convert .sra files into o.pdf},
}

% Sfeir and de Lange (2012): shelterin removal and the telomere end-protection problem.
% options useprefix=true makes the name prefix "de" part of the surname for this entry.
% The page range is stored in full (not abbreviated); the title carries no terminal punctuation.
@article{Sfeir2012,
  abstract = {The telomere end-protection problem is defined by the aggregate of DNA damage signaling and repair pathways that require repression at telomeres. To define the end-protection problem, we removed the whole shelterin complex from mouse telomeres through conditional deletion of TRF1 and TRF2 in nonhomologous end-joining (NHEJ) deficient cells. The data reveal two DNA damage response pathways not previously observed upon deletion of individual shelterin proteins. The shelterin-free telomeres are processed by microhomology-mediated alternative-NHEJ when Ku70/80 is absent and are attacked by nucleolytic degradation in the absence of 53BP1. The data establish that the end-protection problem is specified by six pathways [ATM (ataxia telangiectasia mutated) and ATR (ataxia telangiectasia and Rad3 related) signaling, classical-NHEJ, alt-NHEJ, homologous recombination, and resection] and show how shelterin acts with general DNA damage response factors to solve this problem.},
  author = {Sfeir, Agnel and de Lange, Titia},
  date = {2012-05-04},
  doi = {10/f3w5nz},
  eprint = {22556254},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Sfeir and de Lange - 2012 - Removal of shelterin reveals the telomere end-prot.pdf},
  issn = {1095-9203},
  journaltitle = {Science (New York, N.Y.)},
  keywords = {Animals,Antigens; Nuclear,Antigens; Nuclear: genetics,Antigens; Nuclear: metabolism,Cell Cycle,Cell Cycle Proteins,Cell Cycle Proteins: metabolism,Cells; Cultured,Chromosomal Proteins; Non-Histone,Chromosomal Proteins; Non-Histone: metabolism,DNA Breaks; Double-Stranded,DNA End-Joining Repair,DNA Ligases,DNA Ligases: metabolism,DNA Repair,DNA-Binding Proteins,DNA-Binding Proteins: genetics,DNA-Binding Proteins: metabolism,Homologous Recombination,Mice,Mice; Knockout,Poly(ADP-ribose) Polymerases,Poly(ADP-ribose) Polymerases: metabolism,Protein-Serine-Threonine Kinases,Protein-Serine-Threonine Kinases: metabolism,Signal Transduction,Telomere,Telomere Homeostasis,Telomere-Binding Proteins,Telomere-Binding Proteins: genetics,Telomere-Binding Proteins: metabolism,Telomere: metabolism,Telomere: ultrastructure,Telomeric Repeat Binding Protein 1,Telomeric Repeat Binding Protein 1: genetics,Telomeric Repeat Binding Protein 1: metabolism,Telomeric Repeat Binding Protein 2,Telomeric Repeat Binding Protein 2: genetics,Telomeric Repeat Binding Protein 2: metabolism,Tumor Suppressor Proteins,Tumor Suppressor Proteins: metabolism},
  number = {6081},
  options = {useprefix=true},
  pages = {593-597},
  title = {Removal of Shelterin Reveals the Telomere End-Protection Problem},
  volume = {336}
}

@article{Shao2012,
  author       = {Shao, Zhen and Zhang, Yijing and Yuan, Guo-Cheng and Orkin, Stuart H and Waxman, David J},
  title        = {{{MAnorm}}: A Robust Model for Quantitative Comparison of {{ChIP}}-{{Seq}} Data Sets},
  journaltitle = {Genome Biology},
  date         = {2012},
  volume       = {13},
  number       = {3},
  pages        = {R16},
  doi          = {10/gf9z7s},
  issn         = {1465-6906},
  file         = {/Users/ryan/Documents/Zotero Library/Shao et al. - 2012 - MAnorm a robust model for quantitative comparison.pdf}
}

@article{Shen2012,
  abstract = {Ultra-deep RNA sequencing has become a powerful approach for genome-wide analysis of pre-mRNA alternative splicing. We develop MATS (multivariate analysis of transcript splicing), a bayesian statistical framework for flexible hypothesis testing of differential alternative splicing patterns on RNA-Seq data. MATS uses a multivariate uniform prior to model the between-sample correlation in exon splicing patterns, and a Markov chain Monte Carlo (MCMC) method coupled with a simulation-based adaptive sampling procedure to calculate the P-value and false discovery rate (FDR) of differential alternative splicing. Importantly, the MATS approach is applicable to almost any type of null hypotheses of interest, providing the flexibility to identify differential alternative splicing events that match a given user-defined pattern. We evaluated the performance of MATS using simulated and real RNA-Seq data sets. In the RNA-Seq analysis of alternative splicing events regulated by the epithelial-specific splicing factor ESRP1, we obtained a high RT-PCR validation rate of 86\% for differential exon skipping events with a MATS FDR of {$<$}10\%. Additionally, over the full list of RT-PCR tested exons, the MATS FDR estimates matched well with the experimental validation rate. Our results demonstrate that MATS is an effective and flexible approach for detecting differential alternative splicing from RNA-Seq data.},
  author = {Shen, Shihao and Park, Juw Won and Huang, Jian and Dittmar, Kimberly A. and Lu, Zhi-xiang and Zhou, Qing and Carstens, Russ P and Xing, Yi},
  date = {2012-04},
  doi = {10/fztk53},
  eprint = {22266656},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Shen et al. - 2012 - MATS a Bayesian framework for flexible detection .pdf},
  issn = {1362-4962},
  journaltitle = {Nucleic Acids Research},
  keywords = {Alternative Splicing,Bayes Theorem,Brain,Brain: metabolism,Cell Line; Tumor,High-Throughput Nucleotide Sequencing,Humans,Multivariate Analysis,Reverse Transcriptase Polymerase Chain Reaction,RNA-Binding Proteins,RNA-Binding Proteins: metabolism,Sequence Analysis; RNA},
  number = {8},
  pages = {e61},
  title = {{{MATS}}: A {{Bayesian}} Framework for Flexible Detection of Differential Alternative Splicing from {{RNA}}-{{Seq}} Data},
  volume = {40}
}

@article{Shen2013,
  abstract = {ChIP-seq is increasingly being used for genome-wide profiling of histone modification marks. It is of particular importance to compare ChIP-seq data of two different conditions, such as disease vs. control, and identify regions that show differences in ChIP enrichment. We have developed a powerful and easy to use program, called diffReps, to detect those differential sites from ChIP-seq data, with or without biological replicates. In addition, we have developed two useful tools for ChIP-seq analysis in the diffReps package: one for the annotation of the differential sites and the other for finding chromatin modification "hotspots". diffReps is developed in PERL programming language and runs on all platforms as a command line script. We tested diffReps on two different datasets. One is the comparison of H3K4me3 between two human cell lines from the ENCODE project. The other is the comparison of H3K9me3 in a discrete region of mouse brain between cocaine- and saline-treated conditions. The results indicated that diffReps is a highly sensitive program in detecting differential sites from ChIP-seq data.},
  author = {Shen, Li and Shao, Ning-Yi and Liu, Xiaochuan and Maze, Ian and Feng, Jian and Nestler, Eric J},
  date = {2013-01},
  doi = {10/f473qt},
  eprint = {23762400},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Shen et al. - 2013 - diffReps Detecting Differential Chromatin Modific.pdf},
  issn = {1932-6203},
  journaltitle = {PLoS ONE},
  number = {6},
  pages = {e65598},
  title = {{{diffReps}}: {{Detecting Differential Chromatin Modification Sites}} from {{ChIP}}-Seq {{Data}} with {{Biological Replicates}}},
  volume = {8}
}

@article{Sheng2013,
  abstract = {We propose an adaptive truncated product method that facilitates the selection of the truncation point among a set of candidates. To efficiently estimate the distribution of the proposed method when the p-values are correlated, we develop a single-layer bootstrap procedure.},
  author = {Sheng, Xuguang and Yang, Jingyun},
  date = {2013},
  eprint = {23935232},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Sheng and Yang - 2013 - An adaptive truncated product method for combining.pdf},
  issn = {0165-1765},
  journaltitle = {Economics Letters},
  keywords = {adaptive,p-value,panel cointegration,purchasing power parity,truncated product method},
  number = {2},
  pages = {180--182},
  title = {An Adaptive Truncated Product Method for Combining Dependent P-Values},
  volume = {119}
}

@article{Shi2013,
  author       = {Shi, Yang and Jiang, Hui},
  editor       = {Emmert-Streib, Frank},
  title        = {{{rSeqDiff}}: {{Detecting Differential Isoform Expression}} from {{RNA}}-{{Seq Data Using Hierarchical Likelihood Ratio Test}}},
  journaltitle = {PLoS ONE},
  date         = {2013-11-18},
  volume       = {8},
  number       = {11},
  pages        = {e79448},
  doi          = {10/f5mtdc},
  issn         = {1932-6203},
  file         = {/Users/ryan/Documents/Zotero Library/Shi and Jiang - 2013 - rSeqDiff Detecting Differential Isoform Expressio.pdf}
}

@article{Shin2012,
  abstract = {Butyrate-induced histone acetylation plays an important role in the regulation of gene expression. However, the regulation mechanisms of histone modification remain largely unclear. To comprehensively analyze histone modification induced by butyrate, we utilized chromatin immunoprecipitation (ChIP) technology combined with next-generation sequencing technology (ChIP-seq) to analyze histone modification (acetylation) induced by butyrate and to map the epigenomic landscape of normal histone H3 and acetylated histone H3K9 and H3K27 on a large scale. To determine the location of histone H3, acetyl-H3K9, and acetyl-H3K27 binding sites within the bovine genome, we analyzed the H3-, acetyl-H3K9-, and acetyl-H3K27-enriched binding regions in the proximal promoter within 5 kb upstream, or at the 5' untranslated region (UTR) from the transcriptional start site (TSS), exon, intron, and intergenic regions (defined as regions 25 kb upstream or 10 kb downstream from the TSS). Our analysis indicated that the distribution of histone H3, acetyl-H3K9, and acetyl-H3K27 correlated with transcription activity induced by butyrate. Using the GADEM algorithm, several motifs were generated for each of the ChIP-seq datasets. A de novo search for H3, acetyl-H3K9, and acetyl-H3K27 binding motifs indicated that histone modification (acetylation) at various locations changes the histone H3 binding preferences. Our results reveal that butyrate-induced acetylation in H3K9 and H3K27 changes the sequence-based binding preference of histone H3 and underlies the potential mechanisms of gene expression regulation induced by butyrate.},
  author = {Shin, Joo Heon and Li, Robert W. and Gao, Yuan and Baldwin, VI, Ransom and Li, Cong Jun},
  date = {2012},
  doi = {10/fzpvc4},
  eprint = {22249597},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Shin et al. - 2012 - Genome-wide ChIP-seq mapping and analysis reveal b.pdf},
  issn = {1438-793X},
  journaltitle = {Functional and Integrative Genomics},
  keywords = {Bovine,Butyrate,ChIP-seq,Epigenomics,Histone acetylation},
  number = {1},
  pages = {119--130},
  title = {Genome-Wide {{ChIP}}-Seq Mapping and Analysis Reveal Butyrate-Induced Acetylation of {{H3K9}} and {{H3K27}} Correlated with Transcription Activity in Bovine Cells},
  volume = {12}
}

@article{Shin2014,
  abstract = {Background: The molecular profile of circulating blood can reflect physiological and pathological events occurring in other tissues and organs of the body and delivers a comprehensive view of the status of the immune system. Blood has been useful in studying the pathobiology of many diseases. It is accessible and easily collected making it ideally suited to the development of diagnostic biomarker tests. The blood transcriptome has a high complement of globin RNA that could potentially saturate next-generation sequencing platforms, masking lower abundance transcripts. Methods to deplete globin mRNA are available, but their effect has not been comprehensively studied in peripheral whole blood RNA-Seq data. In this study we aimed to assess technical variability associated with globin depletion in addition to assessing general technical variability in RNA-Seq from whole blood derived samples. Results: We compared technical and biological replicates having undergone globin depletion or not and found that the experimental globin depletion protocol employed removed approximately 80\% of globin transcripts, improved the correlation of technical replicates, allowed for reliable detection of thousands of additional transcripts and generally increased transcript abundance measures. Differential expression analysis revealed thousands of genes significantly upregulated as a result of globin depletion. In addition, globin depletion resulted in the down-regulation of genes involved in both iron and zinc metal ion bonding. Conclusions: Globin depletion appears to meaningfully improve the quality of peripheral whole blood RNA-Seq data, and may improve our ability to detect true biological variation. Some concerns remain, however. Key amongst them the significant reduction in RNA yields following globin depletion. 
More generally, our investigation of technical and biological variation with and without globin depletion finds that high-throughput sequencing by RNA-Seq is highly reproducible within a large dynamic range of detection and provides an accurate estimation of RNA concentration in peripheral whole blood. High-throughput sequencing is thus a promising technology for whole blood transcriptomics and biomarker discovery. \textcopyright{} 2014 Shin et al.},
  author = {Shin, Heesun and Shannon, Casey P. and Fishbane, Nick and Ruan, Jian and Zhou, Mi and Balshaw, Robert and Wilson-McManus, Janet E. and Ng, Raymond T. and McManus, Bruce M. and Tebbutt, Scott J.},
  date = {2014},
  doi = {10/ggcxmp},
  file = {/Users/ryan/Documents/Zotero Library/Shin et al. - 2014 - Variation in RNA-Seq transcriptome profiles of per.pdf},
  issn = {1932-6203},
  journaltitle = {PLoS ONE},
  number = {3},
  pages = {1--11},
  title = {Variation in {{RNA}}-{{Seq}} Transcriptome Profiles of Peripheral Whole Blood from Healthy Individuals with and without Globin Depletion},
  volume = {9}
}

@software{ShoalImprovedMultisample2018,
  abstract = {shoal is a tool which jointly quantifies transcript abundances across multiple samples. Specifically, shoal learns an empirical prior on transcript-level abundances across all of the samples in an experiment, and subsequently applies a variant of the variational Bayesian expectation maximization algorithm to apply this prior adaptively across multi-mapping groups of reads. shoal can increase quantification accuracy, inter-sample consistency, and reduce false positives in downstream differential analysis when applied to multi-condition RNA-seq experiments. Moreover, shoal runs downstream of Salmon and requires less than a minute per-sample to re-estimate transcript abundances while accounting for the learned empirical prior.},
  date = {2018-11-28T05:01:02Z},
  ids = {gh-shoal},
  keywords = {rnaseq},
  organization = {{COMBINE-lab}},
  origdate = {2016-11-01T20:50:44Z},
  title = {Shoal: {{Improved}} Multi-Sample Transcript Abundance Estimates Using Adaptive Priors},
  url = {https://github.com/COMBINE-lab/shoal},
  urldate = {2019-11-14}
}

@article{Si2013,
  abstract = {The recent RNA-seq technology is an attractive method to study gene expression. One of the most important goals in RNA-seq data analysis is to detect genes differentially expressed across treatments. Although several statistical methods have been published, there are no theoretical justifications for whether these methods are optimal or how to search for the optimal test. Furthermore, most proposed tests are designed for testing whether the mean expression levels are exactly the same or not across treatments, whereas sometimes, biologists are interested in detecting genes with expression changes larger than a certain threshold. Another issue with current methods is that the false discovery rate (FDR) control is not well studied. In this manuscript, we propose a test to address all the above issues. Under model assumptions, we derive an optimal test that achieves the maximum of average power among those that control FDR at the same level. We also provide an approximated version, the approximated most average powerful (AMAP) test, for practical implementation. The proposed method allows for testing null hypotheses that are much more general than the ones most previous studies have considered, and it leads to a natural way of controlling the FDR. Through simulation studies, we show that our test has a higher power than other methods, including the widely-used edgeR, DESeq, and baySeq methods, as well as better FDR control than two other FDR control procedures commonly used in practice. For demonstration, we also apply the proposed method to a real RNA-seq dataset obtained from maize.},
  author = {Si, Yaqing and Liu, Peng},
  date = {2013-07-26},
  doi = {10/f5mr2z},
  eprint = {23889143},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Si and Liu - 2013 - An optimal test with maximum average power while c.pdf},
  issn = {1541-0420},
  journaltitle = {Biometrics},
  keywords = {empirical bayes,fdr control,gene expression,maximum average power,rna-seq},
  pages = {1--12},
  title = {An Optimal Test with Maximum Average Power While Controlling {{FDR}} with Application to {{RNA}}-Seq Data}
}

@article{Si2013a,
  abstract = {MOTIVATION: RNA-seq technology has been widely adopted as an attractive alternative to microarray-based methods to study global gene expression. However, robust statistical tools to analyze these complex datasets are still lacking. By grouping genes with similar expression profiles across treatments, cluster analysis provides insight into gene functions and networks and hence is an important technique for RNA-seq data analysis.

RESULTS: In this manuscript, we derive clustering algorithms based on appropriate probability models for RNA-seq data. An Expectation-Maximization (EM) algorithm and another two stochastic versions of EM algorithms are described. In addition, a strategy for initialization based on likelihood is proposed to improve the clustering algorithms. Moreover, we present a model-based hybrid-hierarchical clustering method to generate a tree structure that allows visualization of relationships among clusters as well as flexibility of choosing the number of clusters. Results from both simulation studies and analysis of a maize RNA-seq dataset show that our proposed methods provide better clustering results than alternative methods such as the K-means algorithm and hierarchical clustering methods that are not based on probability models.

AVAILABILITY: An R package, MBCluster.Seq, has been developed to implement our proposed algorithms. This R package provides fast computation and is publicly available at http://www.r-project.org.

CONTACT: siyaqing@iastate.edu and pliu@iastate.edu.},
  author = {Si, Yaqing and Liu, Peng and Li, Pinghua and Brutnell, Thomas P},
  date = {2013-11-04},
  doi = {10/f5qw64},
  eprint = {24191069},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Si et al. - 2013 - Model-Based Clustering for RNA-Seq Data..pdf},
  issn = {1367-4811},
  journaltitle = {Bioinformatics},
  pages = {1--9},
  title = {Model-{{Based Clustering}} for {{RNA}}-{{Seq Data}}}
}

@article{Siegel2014,
  author = {Siegel, T Nicolai and Hon, Chung-Chau and Zhang, Qinfeng and Lopez-Rubio, Jose-Juan and Scheidig-Benatar, Christine and Martins, Rafael M and Sismeiro, Odile and Copp{\'e}e, Jean-Yves and Scherf, Artur},
  date = {2014},
  doi = {10/f5xmnb},
  file = {/Users/ryan/Documents/Zotero Library/Siegel et al. - 2014 - Strand-specific RNA-Seq reveals widespread and dev.pdf},
  issn = {1471-2164},
  journaltitle = {BMC Genomics},
  number = {1},
  pages = {150},
  title = {Strand-Specific {{RNA}}-{{Seq}} Reveals Widespread and Developmentally Regulated Transcription of Natural Antisense Transcripts in {{Plasmodium falciparum}}},
  volume = {15}
}

@article{Siepel2007,
  abstract = {A complete and accurate set of human protein-coding gene annotations is perhaps the single most important resource for genomic research after the human-genome sequence itself, yet the major gene catalogs remain incomplete and imperfect. Here we describe a genome-wide effort, carried out as part of the Mammalian Gene Collection (MGC) project, to identify human genes not yet in the gene catalogs. Our approach was to produce gene predictions by algorithms that rely on comparative sequence data but do not require direct cDNA evidence, then to test predicted novel genes by RT-PCR. We have identified 734 novel gene fragments (NGFs) containing 2188 exons with, at most, weak prior cDNA support. These NGFs correspond to an estimated 563 distinct genes, of which {$>$}160 are completely absent from the major gene catalogs, while hundreds of others represent significant extensions of known genes. The NGFs appear to be predominantly protein-coding genes rather than noncoding RNAs, unlike novel transcribed sequences identified by technologies such as tiling arrays and CAGE. They tend to be expressed at low levels and in a tissue-specific manner, and they are enriched for roles in motor activity, cell adhesion, connective tissue, and central nervous system development. Our results demonstrate that many important genes and gene fragments have been missed by traditional approaches to gene discovery but can be identified by their evolutionary signatures using comparative sequence data. However, they suggest that hundreds---not thousands---of protein-coding genes are completely missing from the current gene catalogs.},
  author = {Siepel, Adam and Diekhans, Mark and Brejov{\'a}, Brona and Langton, Laura and Stevens, Michael and Comstock, Charles L G and Davis, Colleen and Ewing, Brent and Oommen, Shelly and Lau, Christopher and Yu, Hung-Chun and Li, Jianfeng and Roe, Bruce A. and Green, Phil and Gerhard, Daniela S and Temple, Gary and Haussler, David and Brent, Michael R},
  date = {2007-12},
  doi = {10/cq8hj7},
  eprint = {17989246},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Siepel et al. - 2007 - Targeted discovery of novel human exons by compara.pdf},
  issn = {1088-9051},
  journaltitle = {Genome Research},
  keywords = {Animals,Base Sequence,Chickens,Chickens: genetics,Computational Biology,Exons,Exons: genetics,Expressed Sequence Tags,Genome; Human,Genomics,Humans,Mice,Predictive Value of Tests,Rats,Reverse Transcriptase Polymerase Chain Reaction,Zebrafish,Zebrafish: embryology,Zebrafish: genetics},
  number = {12},
  pages = {1763--1773},
  title = {Targeted Discovery of Novel Human Exons by Comparative Genomics},
  volume = {17}
}

@article{Silberzahn2018,
  abstract = {Twenty-nine teams involving 61 analysts used the same dataset to address the same research question: whether soccer referees are more likely to give red cards to dark skin toned players than light skin toned players. Analytic approaches varied widely across teams, and estimated effect sizes ranged from 0.89 to 2.93 in odds ratio units, with a median of 1.31. Twenty teams (69\%) found a statistically significant positive effect and nine teams (31\%) observed a non-significant relationship. Crowdsourcing data analysis, a strategy by which numerous research teams are recruited to simultaneously investigate the same research question, makes transparent how variations in analytical choices affect results.},
  author = {Silberzahn, R. and Uhlmann, E. L. and Martin, D. P. and Anselmi, P. and Aust, F. and Awtrey, E. and Bahn{\'i}k, {\v S}. and Bai, F. and Bannard, C. and Bonnier, E. and Carlsson, R. and Cheung, F. and Christensen, G. and Clay, R. and Craig, M. A. and Dalla Rosa, A. and Dam, L. and Evans, M. H. and Flores Cervantes, I. and Fong, N. and Gamez-Djokic, M. and Glenz, A. and Gordon-McKeon, S. and Heaton, T. J. and Hederos, K. and Heene, M. and Hofelich Mohr, A. J. and H{\"o}gden, F. and Hui, K. and Johannesson, M. and Kalodimos, J. and Kaszubowski, E. and Kennedy, D. M. and Lei, R. and Lindsay, T. A. and Liverani, S. and Madan, C. R. and Molden, D. and Molleman, E. and Morey, R. D. and Mulder, L. B. and Nijstad, B. R. and Pope, N. G. and Pope, B. and Prenoveau, J. M. and Rink, F. and Robusto, E. and Roderique, H. and Sandberg, A. and Schl{\"u}ter, E. and Sch{\"o}nbrodt, F. D. and Sherman, M. F. and Sommer, S. A. and Sotak, K. and Spain, S. and Sp{\"o}rlein, C. and Stafford, T. and Stefanutti, L. and Tauber, S. and Ullrich, J. and Vianello, M. and Wagenmakers, E.-J. and Witkowiak, M. and Yoon, S. and Nosek, B. A.},
  date = {2018-09-23},
  doi = {10/gd2429},
  file = {/Users/ryan/Documents/Zotero Library/Silberzahn et al. - 2018 - Many Analysts, One Data Set Making Transparent Ho.pdf},
  issn = {2515-2459},
  journaltitle = {Advances in Methods and Practices in Psychological Science},
  number = {3},
  pages = {337--356},
  title = {Many {{Analysts}}, {{One Data Set}}: {{Making Transparent How Variations}} in {{Analytic Choices Affect Results}}},
  volume = {1}
}

@article{Simes1986,
  abstract = {A modification of the Bonferroni procedure for testing multiple hypotheses is presented. The method, based on the ordered p-values of the individual tests, is less conservative than the classical Bonferroni procedure but is still simple to apply. A simulation study shows that the probability of a type I error of the procedure does not exceed the nominal significance level, {$\alpha$}, for a variety of multivariate normal and multivariate gamma test statistics. For independent tests the procedure has type I error probability equal to {$\alpha$}. The method appears particularly advantageous over the classical Bonferroni procedure when several highly-correlated test statistics are involved.},
  author = {Simes, R. J.},
  date = {1986},
  doi = {10/cpk5v6},
  file = {/Users/ryan/Documents/Zotero Library/Simes - 1986 - An improved bonferroni procedure for multiple test.pdf},
  issn = {0006-3444},
  journaltitle = {Biometrika},
  keywords = {Bonferroni inequality,Multiple comparisons,Simultaneous test procedures},
  number = {3},
  pages = {751--754},
  title = {An Improved {{Bonferroni}} Procedure for Multiple Tests of Significance},
  volume = {73}
}

@article{Simmons2011,
  abstract = {In this article, we accomplish two things. First, we show that despite empirical psychologists' nominal endorsement of a low rate of false-positive findings ({$\leq$}.05), flexibility in data collection, analysis, and reporting dramatically increases actual false-positive rates. In many cases, a researcher is more likely to falsely find evidence that an effect exists than to correctly find evidence that it does not. We present computer simulations and a pair of actual experiments that demonstrate how unacceptably easy it is to accumulate (and report) statistically significant evidence for a false hypothesis. Second, we suggest a simple, low-cost, and straightforwardly effective disclosure-based solution to this problem. The solution involves six concrete requirements for authors and four guidelines for reviewers, all of which impose a minimal burden on the publication process. \textcopyright{} The Author(s) 2011.},
  author = {Simmons, Joseph P. and Nelson, Leif D. and Simonsohn, Uri},
  date = {2011-11-17},
  doi = {10/bxbw3c},
  eprint = {22006061},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Simmons et al. - 2011 - False-positive psychology Undisclosed flexibility.pdf},
  issn = {1467-9280},
  journaltitle = {Psychological Science},
  keywords = {disclosure,methodology,motivated reasoning,publication},
  number = {11},
  pages = {1359--1366},
  title = {False-Positive Psychology: {{Undisclosed}} Flexibility in Data Collection and Analysis Allows Presenting Anything as Significant},
  volume = {22}
}

@article{Sing2005,
  author = {Sing, T. and Sander, O. and Beerenwinkel, N. and Lengauer, T.},
  date = {2005-08-11},
  doi = {10/dh5bq6},
  file = {/Users/ryan/Documents/Zotero Library/Sing et al. - 2005 - ROCR visualizing classifier performance in R.pdf},
  issn = {1367-4803},
  journaltitle = {Bioinformatics},
  number = {20},
  pages = {3940--3941},
  title = {{{ROCR}}: Visualizing Classifier Performance in {{R}}},
  volume = {21}
}

@article{Singh2011,
  abstract = {In eukaryotic cells, alternative splicing expands the diversity of RNA transcripts and plays an important role in tissue-specific differentiation, and can be misregulated in disease. To understand these processes, there is a great need for methods to detect differential transcription between samples. Our focus is on samples observed using short-read RNA sequencing (RNA-seq).},
  author = {Singh, Darshan and Orellana, Christian F and Hu, Yin and Jones, Corbin D and Liu, Yufeng and Chiang, Derek Y and Liu, Jinze and Prins, Jan F},
  date = {2011-10-01},
  doi = {10/cm7zjz},
  eprint = {21824971},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Singh et al. - 2011 - FDM a graph-based statistical method to detect di.pdf},
  issn = {1367-4811},
  journaltitle = {Bioinformatics},
  keywords = {Alternative Splicing,Gene Expression Profiling,Gene Expression Profiling: methods,Genome,Humans,Models; Genetic,Protein Isoforms,Protein Isoforms: genetics,RNA,RNA: genetics,Sequence Analysis; RNA,Sequence Analysis; RNA: methods,Transcription; Genetic,Transcriptome,Transcriptome: genetics},
  number = {19},
  pages = {2633--2640},
  title = {{{FDM}}: A Graph-Based Statistical Method to Detect Differential Transcription Using {{RNA}}-Seq Data},
  volume = {27}
}

@article{Smilde2005,
  abstract = {MOTIVATION: Datasets resulting from metabolomics or metabolic profiling experiments are becoming increasingly complex. Such datasets may contain underlying factors, such as time (time-resolved or longitudinal measurements), doses or combinations thereof. Currently used biostatistics methods do not take the structure of such complex datasets into account. However, incorporating this structure into the data analysis is important for understanding the biological information in these datasets. RESULTS: We describe ASCA, a new method that can deal with complex multivariate datasets containing an underlying experimental design, such as metabolomics datasets. It is a direct generalization of analysis of variance (ANOVA) for univariate data to the multivariate case. The method allows for easy interpretation of the variation induced by the different factors of the design. The method is illustrated with a dataset from a metabolomics experiment with time and dose factors.},
  author = {Smilde, Age K and Jansen, Jeroen J and Hoefsloot, Huub C J and Lamers, Robert-Jan A. N. and van der Greef, Jan and Timmerman, Marieke E},
  date = {2005-07-01},
  doi = {10/frmpm9},
  eprint = {15890747},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Smilde et al. - 2005 - ANOVA-simultaneous component analysis (ASCA) a ne.pdf},
  issn = {1367-4803},
  journaltitle = {Bioinformatics},
  keywords = {Algorithms,Analysis of Variance,Animals,Ascorbic Acid,Ascorbic Acid: therapeutic use,Biological Markers,Biological Markers: urine,Computer Simulation,Dose-Response Relationship; Drug,Energy Metabolism,Gene Expression Profiling,Gene Expression Profiling: methods,Guinea Pigs,Male,Models; Biological,Models; Statistical,Multivariate Analysis,Osteoarthritis,Osteoarthritis: diagnosis,Osteoarthritis: drug therapy,Osteoarthritis: urine,Proteome,Proteome: metabolism,Software,Treatment Outcome},
  number = {13},
  options = {useprefix=true},
  pages = {3043--3048},
  title = {{{ANOVA}}-Simultaneous Component Analysis ({{ASCA}}): A New Tool for Analyzing Designed Metabolomics Data},
  volume = {21}
}

@article{Smith2012,
  abstract = {Fifty per cent of the genome is discontinuously replicated on the lagging strand as Okazaki fragments. Eukaryotic Okazaki fragments remain poorly characterized and, because nucleosomes are rapidly deposited on nascent DNA, Okazaki fragment processing and nucleosome assembly potentially affect one another. Here we show that ligation-competent Okazaki fragments in Saccharomyces cerevisiae are sized according to the nucleosome repeat. Using deep sequencing, we demonstrate that ligation junctions preferentially occur near nucleosome midpoints rather than in internucleosomal linker regions. Disrupting chromatin assembly or lagging-strand polymerase processivity affects both the size and the distribution of Okazaki fragments, suggesting a role for nascent chromatin, assembled immediately after the passage of the replication fork, in the termination of Okazaki fragment synthesis. Our studies represent the first high-resolution analysis--to our knowledge--of eukaryotic Okazaki fragments in vivo, and reveal the interconnection between lagging-strand synthesis and chromatin assembly.},
  author = {Smith, Duncan J and Whitehouse, Iestyn},
  date = {2012-03-22},
  doi = {10/f4msqp},
  eprint = {22419157},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Smith and Whitehouse - 2012 - Intrinsic coupling of lagging-strand synthesis to .pdf},
  issn = {1476-4687},
  journaltitle = {Nature},
  keywords = {Chromatin Assembly and Disassembly,Chromatin Assembly and Disassembly: physiology,DNA,DNA Ligases,DNA Ligases: deficiency,DNA Ligases: metabolism,DNA Polymerase III,DNA Polymerase III: metabolism,DNA Replication,DNA-Binding Proteins,DNA-Binding Proteins: metabolism,DNA: biosynthesis,DNA: genetics,DNA: metabolism,High-Throughput Nucleotide Sequencing,Nucleosomes,Nucleosomes: genetics,Nucleosomes: metabolism,Protein Binding,Saccharomyces cerevisiae,Saccharomyces cerevisiae Proteins,Saccharomyces cerevisiae Proteins: metabolism,Saccharomyces cerevisiae: enzymology,Saccharomyces cerevisiae: genetics,Transcription Factors,Transcription Factors: metabolism},
  number = {7390},
  pages = {434--438},
  title = {Intrinsic Coupling of Lagging-Strand Synthesis to Chromatin Assembly},
  volume = {483}
}

@article{Smyth1999,
  abstract = {This paper considers double generalized linear models, which allow the mean and dispersion to be modelled simultaneously in a generalized linear model context. Estimation of the dispersion parameters is based on a {$\chi^2_1$} approximation to the unit deviances, and the accuracy of the saddle-point approximation which underlies this is discussed. Approximate REML methods are developed for estimation of the dispersion. The approximate REML methods can be implemented with very little added complication in a generalized linear model setting by adjusting the working vector and working weights. S-Plus functions for double generalized linear models are described. Through two data examples it is shown that the approximate REML methods are more robust than maximum likelihood, in the sense of being less sensitive to perturbations in the mean model. Copyright \textcopyright{} 1999 John Wiley \& Sons, Ltd.},
  author = {Smyth, Gordon K. and Verbyla, Ar{\=u}nas P.},
  date = {1999},
  doi = {10/c6x63m},
  file = {/Users/ryan/Documents/Zotero Library/Smyth and Verbyla - 1999 - Adjusted likelihood methods for modelling dispersi.pdf},
  issn = {1099-095X},
  journaltitle = {Environmetrics},
  keywords = {adjusted profile likelihood,dispersion modelling,generalized linear models,reml,slippage models},
  number = {6},
  pages = {695--709},
  title = {Adjusted Likelihood Methods for Modelling Dispersion in Generalized Linear Models},
  volume = {10}
}

@article{Smyth2004,
  abstract = {The problem of identifying differentially expressed genes in designed microarray experiments is considered. Lonnstedt and Speed (2002) derived an expression for the posterior odds of differential expression in a replicated two-color experiment using a simple hierarchical parametric model. The purpose of this paper is to develop the hierarchical model of Lonnstedt and Speed (2002) into a practical approach for general microarray experiments with arbitrary numbers of treatments and RNA samples. The model is reset in the context of general linear models with arbitrary coefficients and contrasts of interest. The approach applies equally well to both single channel and two color microarray experiments. Consistent, closed form estimators are derived for the hyperparameters in the model. The estimators proposed have robust behavior even for small numbers of arrays and allow for incomplete data arising from spot filtering or spot quality weights. The posterior odds statistic is reformulated in terms of a moderated t-statistic in which posterior residual standard deviations are used in place of ordinary standard deviations. The empirical Bayes approach is equivalent to shrinkage of the estimated sample variances towards a pooled estimate, resulting in far more stable inference when the number of arrays is small. The use of moderated t-statistics has the advantage over the posterior odds that the number of hyperparameters which need to estimated is reduced; in particular, knowledge of the non-null prior for the fold changes are not required. The moderated t-statistic is shown to follow a t-distribution with augmented degrees of freedom. The moderated t inferential approach extends to accommodate tests of composite null hypotheses through the use of moderated F-statistics. The performance of the methods is demonstrated in a simulation study. Results are presented for two publicly available data sets.},
  author = {Smyth, Gordon K},
  date = {2004-01-12},
  doi = {10/ddqzg9},
  eprint = {16646809},
  eprinttype = {pmid},
  issn = {1544-6115},
  journaltitle = {Statistical Applications in Genetics and Molecular Biology},
  keywords = {\#nosource,bayes,microarray,moderated t-statistic},
  number = {1},
  pages = {1--25},
  title = {Linear {{Models}} and {{Empirical Bayes Methods}} for {{Assessing Differential Expression}} in {{Microarray Experiments}}},
  volume = {3}
}

@incollection{Smyth2005,
  abstract = {A survey is given of differential expression analyses using the linear modeling features of the limma package. The chapter starts with the simplest replicated designs and progresses through experiments with two or more groups, direct designs, factorial designs and time course experiments. Experiments with technical as well as biological replication are considered. Empirical Bayes test statistics are explained. The use of quality weights, adaptive background correction and control spots in conjunction with linear modelling is illustrated on the {$\beta$}7 data.},
  author = {Smyth, Gordon K},
  booktitle = {Bioinformatics and Computational Biology Solutions Using {{R}} and {{Bioconductor}}},
  date = {2005},
  doi = {10/dv8chk},
  editor = {Gentleman, Robert and Carey, Vincent J. and Huber, Wolfgang and Irizarry, Rafael A. and Dudoit, Sandrine},
  eprint = {16495579},
  eprinttype = {pmid},
  isbn = {9780387251462},
  keywords = {\#nosource,limma,microarray},
  location = {New York},
  pages = {397--420},
  publisher = {Springer},
  title = {Limma: {{Linear Models}} for {{Microarray Data}}}
}

@article{Smyth2005a,
  abstract = {MOTIVATION: Spotted arrays are often printed with probes in duplicate or triplicate, but current methods for assessing differential expression are not able to make full use of the resulting information. The usual practice is to average the duplicate or triplicate results for each probe before assessing differential expression. This results in the loss of valuable information about genewise variability.

RESULTS: A method is proposed for extracting more information from within-array replicate spots in microarray experiments by estimating the strength of the correlation between them. The method involves fitting separate linear models to the expression data for each gene but with a common value for the between-replicate correlation. The method greatly improves the precision with which the genewise variances are estimated and thereby improves inference methods designed to identify differentially expressed genes. The method may be combined with empirical Bayes methods for moderating the genewise variances between genes. The method is validated using data from a microarray experiment involving calibration and ratio control spots in conjunction with spiked-in RNA. Comparing results for calibration and ratio control spots shows that the common correlation method results in substantially better discrimination of differentially expressed genes from those which are not. The spike-in experiment also confirms that the results may be further improved by empirical Bayes smoothing of the variances when the sample size is small.

AVAILABILITY: The methodology is implemented in the limma software package for R, available from the CRAN repository http://www.r-project.org},
  author = {Smyth, Gordon K and Michaud, Jo{\"e}lle and Scott, Hamish S},
  date = {2005-05-01},
  doi = {10/ffmjnv},
  eprint = {15657102},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Smyth et al. - 2005 - Use of within-array replicate spots for assessing .pdf},
  issn = {1367-4803},
  journaltitle = {Bioinformatics},
  keywords = {Algorithms,Gene Expression Profiling,Gene Expression Profiling: methods,Genetic Variation,Genetic Variation: genetics,In Situ Hybridization; Fluorescence,In Situ Hybridization; Fluorescence: methods,Linear Models,Models; Genetic,Models; Statistical,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods,RNA; Messenger,RNA; Messenger: genetics,Software},
  number = {9},
  pages = {2067--2075},
  title = {Use of Within-Array Replicate Spots for Assessing Differential Expression in Microarray Experiments},
  volume = {21}
}

@article{Smyth2013,
  abstract = {Permutation tests are amongst the most commonly used statistical tools in modern genomic research, a process by which p-values are attached to a test statistic by randomly permuting the sample or gene labels. Yet permutation p-values published in the genomic literature are often computed incorrectly, understated by about 1/m, where m is the number of permutations. The same is often true in the more general situation when Monte Carlo simulation is used to assign p-values. Although the p-value understatement is usually small in absolute terms, the implications can be serious in a multiple testing context. The understatement arises from the intuitive but mistaken idea of using permutation to estimate the tail probability of the test statistic. We argue instead that permutation should be viewed as generating an exact discrete null distribution. The relevant literature, some of which is likely to have been relatively inaccessible to the genomic community, is reviewed and summarized. A computation strategy is developed for exact p-values when permutations are randomly drawn. The strategy is valid for any number of permutations and samples. Some simple recommendations are made for the implementation of permutation tests in practice.},
  author = {Phipson, Belinda and Smyth, Gordon K},
  date = {2010-01},
  doi = {10/ckkrfx},
  eprint = {21044043},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Phipson and Smyth - 2010 - Permutation P-values should never be zero calcula.pdf},
  issn = {1544-6115},
  journaltitle = {Statistical Applications in Genetics and Molecular Biology},
  keywords = {Gene Expression Profiling,Gene Expression Profiling: methods,Genomics,Genomics: methods,Models,Probability,Statistical},
  number = {1},
  pages = {Article39},
  title = {Permutation {{P}}-Values Should Never Be Zero: Calculating Exact {{P}}-Values When Permutations Are Randomly Drawn},
  volume = {9}
}

@article{Soneson2013,
  abstract = {BACKGROUND: Finding genes that are differentially expressed between conditions is an integral part of understanding the molecular basis of phenotypic variation. In the past decades, DNA microarrays have been used extensively to quantify the abundance of mRNA corresponding to different genes, and more recently high-throughput sequencing of cDNA (RNA-seq) has emerged as a powerful competitor. As the cost of sequencing decreases, it is conceivable that the use of RNA-seq for differential expression analysis will increase rapidly. To exploit the possibilities and address the challenges posed by this relatively new type of data, a number of software packages have been developed especially for differential expression analysis of RNA-seq data.

RESULTS: We conducted an extensive comparison of eleven methods for differential expression analysis of RNA-seq data. All methods are freely available within the R framework and take as input a matrix of counts, i.e. the number of reads mapping to each genomic feature of interest in each of a number of samples. We evaluate the methods based on both simulated data and real RNA-seq data.

CONCLUSIONS: Very small sample sizes, which are still common in RNA-seq experiments, impose problems for all evaluated methods and any results obtained under such conditions should be interpreted with caution. For larger sample sizes, the methods combining a variance-stabilizing transformation with the 'limma' method for differential expression analysis perform well under many different conditions, as does the nonparametric SAMseq method.},
  author = {Soneson, Charlotte and Delorenzi, Mauro},
  date = {2013-01},
  doi = {10/gb8v4r},
  eprint = {23497356},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Soneson and Delorenzi - 2013 - A comparison of methods for differential expressio.pdf},
  issn = {1471-2105},
  journaltitle = {BMC bioinformatics},
  keywords = {differential expression,gene expression,rna-seq},
  number = {1},
  pages = {91},
  title = {A Comparison of Methods for Differential Expression Analysis of {{RNA}}-Seq Data},
  volume = {14}
}

@article{Soneson2015,
  abstract = {High-throughput sequencing of cDNA (RNA-seq) is used extensively to characterize the transcriptome of cells. Many transcriptomic studies aim at comparing either abundance levels or the transcriptome composition between given conditions, and as a first step, the sequencing reads must be used as the basis for abundance quantification of transcriptomic features of interest, such as genes or transcripts. Various quantification approaches have been proposed, ranging from simple counting of reads that overlap given genomic regions to more complex estimation of underlying transcript abundances. In this paper, we show that gene-level abundance estimates and statistical inference offer advantages over transcript-level analyses, in terms of performance and interpretability. We also illustrate that the presence of differential isoform usage can lead to inflated false discovery rates in differential gene expression analyses on simple count matrices but that this can be addressed by incorporating offsets derived from transcript-level abundance estimates. We also show that the problem is relatively minor in several real data sets. Finally, we provide an R package (tximport) to help users integrate transcript-level abundance estimates from common quantification pipelines into count-based statistical inference engines.},
  author = {Soneson, Charlotte and Love, Michael I. and Robinson, Mark D.},
  date = {2015},
  doi = {10/gfs3ng},
  eprint = {26925227},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Soneson et al. - 2015 - Differential analyses for RNA-seq transcript-leve.pdf},
  issn = {2046-1402},
  journaltitle = {F1000Research},
  keywords = {gene expression,quantification,RNA-seq,transcriptomics},
  pages = {1521},
  title = {Differential Analyses for {{RNA}}-Seq: Transcript-Level Estimates Improve Gene-Level Inferences},
  volume = {4}
}

@article{Song2014,
  author = {Song, Li and Florea, Liliana and Langmead, Ben},
  date = {2014},
  doi = {10/gfkmwv},
  eprint = {25398208},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Song et al. - 2014 - Lighter fast and memory-efficient sequencing erro.pdf},
  issn = {1465-6906},
  journaltitle = {Genome Biology},
  number = {11},
  pages = {509},
  title = {Lighter: Fast and Memory-Efficient Sequencing Error Correction without Counting},
  volume = {15}
}

@article{Spyrou2009,
  abstract = {High-throughput sequencing technology has become popular and widely used to study protein and DNA interactions. Chromatin immunoprecipitation, followed by sequencing of the resulting samples, produces large amounts of data that can be used to map genomic features such as transcription factor binding sites and histone modifications.},
  author = {Spyrou, Christiana and Stark, Rory and Lynch, Andy G and Tavar{\'e}, Simon},
  date = {2009-01},
  doi = {10/crjtbm},
  eprint = {19772557},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Spyrou et al. - 2009 - BayesPeak Bayesian analysis of ChIP-seq data..pdf},
  issn = {1471-2105},
  journaltitle = {BMC bioinformatics},
  keywords = {Bayes Theorem,Binding Sites,Chromatin Immunoprecipitation,Chromatin Immunoprecipitation: methods,Computational Biology,Computational Biology: methods,DNA,DNA: chemistry,DNA: metabolism,Proteins,Proteins: metabolism},
  pages = {299},
  title = {{{BayesPeak}}: {{Bayesian}} Analysis of {{ChIP}}-Seq Data},
  volume = {10}
}

@article{Srivastava2010,
  abstract = {Deep sequencing of RNAs (RNA-seq) has been a useful tool to characterize and quantify transcriptomes. However, there are significant challenges in the analysis of RNA-seq data, such as how to separate signals from sequencing bias and how to perform reasonable normalization. Here, we focus on a fundamental question in RNA-seq analysis: the distribution of the position-level read counts. Specifically, we propose a two-parameter generalized Poisson (GP) model to the position-level read counts. We show that the GP model fits the data much better than the traditional Poisson model. Based on the GP model, we can better estimate gene or exon expression, perform a more reasonable normalization across different samples, and improve the identification of differentially expressed genes and the identification of differentially spliced exons. The usefulness of the GP model is demonstrated by applications to multiple RNA-seq data sets.},
  author = {Srivastava, Sudeep and Chen, Liang},
  date = {2010-09},
  doi = {10/c3xqjj},
  eprint = {20671027},
  eprinttype = {pmid},
  issn = {1362-4962},
  journaltitle = {Nucleic acids research},
  keywords = {\#nosource,Animals,Exons,Gene Expression Profiling,Gene Expression Profiling: methods,Humans,Mice,Models; Statistical,Poisson Distribution,RNA Splicing,Sequence Analysis; RNA,Sequence Analysis; RNA: methods},
  number = {17},
  pages = {e170},
  title = {A Two-Parameter Generalized {{Poisson}} Model to Improve the Analysis of {{RNA}}-Seq Data},
  volume = {38}
}

@article{Srivastava2016,
  abstract = {Motivation: The alignment of sequencing reads to a transcriptome is a common and important step in many RNA-seq analysis tasks. When aligning RNA-seq reads directly to a transcriptome (as is common in the de novo setting or when a trusted reference annotation is available), care must be taken to report the potentially large number of multi-mapping locations per read. This can pose a substantial computational burden for existing aligners, and can considerably slow downstream analysis. Results: We introduce a novel concept, quasi-mapping, and an efficient algorithm implementing this approach for mapping sequencing reads to a transcriptome. By attempting only to report the potential loci of origin of a sequencing read, and not the base-to-base alignment by which it derives from the reference, RapMap - our tool implementing quasi-mapping - is capable of mapping sequencing reads to a target transcriptome substantially faster than existing alignment tools. The algorithm we use to implement quasi-mapping uses several efficient data structures and takes advantage of the special structure of shared sequence prevalent in transcriptomes to rapidly provide highly-accurate mapping information. We demonstrate how quasi-mapping can be successfully applied to the problems of transcript-level quantification from RNA-seq reads and the clustering of contigs from de novo assembled transcriptomes into biologically meaningful groups.},
  author = {Srivastava, Avi and Sarkar, Hirak and Gupta, Nitish and Patro, Rob},
  date = {2016-10-22},
  doi = {10/f8vm2j},
  eprint = {27307617},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Srivastava et al. - 2016 - RapMap a rapid, sensitive and accurate tool for m.pdf},
  issn = {1367-4803},
  journaltitle = {Bioinformatics},
  keywords = {★},
  number = {12},
  pages = {i192--i200},
  title = {{{RapMap}}: A Rapid, Sensitive and Accurate Tool for Mapping {{RNA}}-Seq Reads to Transcriptomes},
  volume = {32}
}

@article{Stadler2011,
  abstract = {Methylation of cytosines is an essential epigenetic modification in mammalian genomes, yet the rules that govern methylation patterns remain largely elusive. To gain insights into this process, we generated base-pair-resolution mouse methylomes in stem cells and neuronal progenitors. Advanced quantitative analysis identified low-methylated regions (LMRs) with an average methylation of 30\%. These represent CpG-poor distal regulatory regions as evidenced by location, DNase I hypersensitivity, presence of enhancer chromatin marks and enhancer activity in reporter assays. LMRs are occupied by DNA-binding factors and their binding is necessary and sufficient to create LMRs. A comparison of neuronal and stem-cell methylomes confirms this dependency, as cell-type-specific LMRs are occupied by cell-type-specific transcription factors. This study provides methylome references for the mouse and shows that DNA-binding factors locally influence DNA methylation, enabling the identification of active regulatory regions.},
  author = {Stadler, Michael B and Murr, Rabih and Burger, Lukas and Ivanek, Robert and Lienert, Florian and Sch{\"o}ler, Anne and Wirbelauer, Christiane and Oakeley, Edward J and Gaidatzis, Dimos and Tiwari, Vijay K and Sch{\"u}beler, Dirk},
  date = {2011-12-22},
  doi = {10/bfsz5x},
  eprint = {22170606},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Stadler et al. - 2011 - DNA-binding factors shape the mouse methylome at d.pdf},
  issn = {1476-4687},
  journaltitle = {Nature},
  keywords = {Animals,Cell Differentiation,CpG Islands,Cytosine,Cytosine: metabolism,DNA Methylation,DNA-Binding Proteins,DNA-Binding Proteins: metabolism,Embryonic Stem Cells,Embryonic Stem Cells: cytology,Epigenomics,Mice,Neurons,Neurons: cytology,Promoter Regions; Genetic,Promoter Regions; Genetic: genetics,Protein Binding,Stem Cells,Stem Cells: cytology,Transcription Factors,Transcription Factors: metabolism},
  number = {7378},
  pages = {490--495},
  title = {{{DNA}}-Binding Factors Shape the Mouse Methylome at Distal Regulatory Regions},
  volume = {480}
}

@article{Standage2012,
  abstract = {Accurate gene structure annotation is a fundamental but somewhat elusive goal of genome projects, as witnessed by the fact that (model) genomes typically undergo several cycles of re-annotation. In many cases, it is not only different versions of annotations that need to be compared but also different sources of annotation of the same genome, derived from distinct gene prediction workflows. Such comparisons are of interest to annotation providers, prediction software developers, and end-users, who all need to assess what is common and what is different among distinct annotation sources. We developed ParsEval, a software application for pairwise comparison of sets of gene structure annotations. ParsEval calculates several statistics that highlight the similarities and differences between the two sets of annotations provided. These statistics are presented in an aggregate summary report, with additional details provided as individual reports specific to non-overlapping, gene-model-centric genomic loci. Genome browser styled graphics embedded in these reports help visualize the genomic context of the annotations. Output from ParsEval is both easily read and parsed, enabling systematic identification of problematic gene models for subsequent focused analysis.},
  author = {Standage, Daniel S and Brendel, Volker P},
  date = {2012-01},
  doi = {10/f4cb92},
  eprint = {22852583},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Standage and Brendel - 2012 - ParsEval parallel comparison and analysis of gene.pdf},
  issn = {1471-2105},
  journaltitle = {BMC bioinformatics},
  pages = {187},
  title = {{{ParsEval}}: Parallel Comparison and Analysis of Gene Structure Annotations},
  volume = {13}
}

@article{Stanke2003,
  author = {Stanke, Mario and Waack, Stephan},
  date = {2003-10-08},
  doi = {10/chr4sf},
  file = {/Users/ryan/Documents/Zotero Library/Stanke and Waack - 2003 - Gene prediction with a hidden Markov model and a n.pdf},
  issn = {1367-4803},
  issue = {Suppl 2},
  journaltitle = {Bioinformatics},
  pages = {ii215--ii225},
  title = {Gene Prediction with a Hidden {{Markov}} Model and a New Intron Submodel},
  volume = {19}
}

@article{Stanke2006,
  abstract = {In order to improve gene prediction, extrinsic evidence on the gene structure can be collected from various sources of information such as genome-genome comparisons and EST and protein alignments. However, such evidence is often incomplete and usually uncertain. The extrinsic evidence is usually not sufficient to recover the complete gene structure of all genes completely and the available evidence is often unreliable. Therefore extrinsic evidence is most valuable when it is balanced with sequence-intrinsic evidence.},
  author = {Stanke, Mario and Sch{\"o}ffmann, Oliver and Morgenstern, Burkhard and Waack, Stephan},
  date = {2006-01},
  doi = {10/cv8xsn},
  eprint = {16469098},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Stanke et al. - 2006 - Gene prediction in eukaryotes with a generalized h.pdf},
  issn = {1471-2105},
  journaltitle = {BMC bioinformatics},
  keywords = {Algorithms,Animals,Artificial Intelligence,Base Sequence,Chromosome Mapping,Chromosome Mapping: methods,Computer Simulation,Databases; Genetic,Genetic Variation,Genetic Variation: genetics,Humans,Information Storage and Retrieval,Information Storage and Retrieval: methods,Markov Chains,Models; Genetic,Models; Statistical,Molecular Sequence Data,Pattern Recognition; Automated,Sequence Alignment,Sequence Alignment: methods,Sequence Analysis; DNA,Sequence Analysis; DNA: methods,Stochastic Processes},
  pages = {62},
  title = {Gene Prediction in Eukaryotes with a Generalized Hidden {{Markov}} Model That Uses Hints from External Sources},
  volume = {7}
}

@article{Stanke2008,
  abstract = {Computational annotation of protein coding genes in genomic DNA is a widely used and essential tool for analyzing newly sequenced genomes. However, current methods suffer from inaccuracy and do poorly with certain types of genes. Including additional sources of evidence of the existence and structure of genes can improve the quality of gene predictions. For many eukaryotic genomes, expressed sequence tags (ESTs) are available as evidence for genes. Related genomes that have been sequenced, annotated, and aligned to the target genome provide evidence of existence and structure of genes.},
  author = {Stanke, Mario and Diekhans, Mark and Baertsch, Robert and Haussler, David},
  date = {2008-03-01},
  doi = {10/fjbrdc},
  eprint = {18218656},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Stanke et al. - 2008 - Using native and syntenically mapped cDNA alignmen.pdf},
  issn = {1367-4811},
  journaltitle = {Bioinformatics},
  keywords = {Alternative Splicing,Animals,DNA; Complementary,DNA; Complementary: genetics,Expressed Sequence Tags,Humans,Sequence Alignment},
  number = {5},
  pages = {637--644},
  title = {Using Native and Syntenically Mapped {{cDNA}} Alignments to Improve de Novo Gene Finding},
  volume = {24}
}

@article{Statham2010,
  abstract = {Summary: Epigenetics, the study of heritable somatic phenotypic changes not related to DNA sequence, has emerged as a critical component of the landscape of gene regulation. The epigenetic layers, such as DNA methylation, histone modifications and nuclear architecture are now being extensively studied in many cell types and disease settings. Few software tools exist to summarize and interpret these datasets. We have created a toolbox of procedures to interrogate and visualize epigenomic data (both array- and sequencing-based) and make available a software package for the cross-platform R language. Availability: The package is freely available under LGPL from the R-Forge web site (http://repitools.r-forge.r-project.org/). Contact: mrobinson@wehi.edu.au. \textcopyright{} The Author(s) 2010. Published by Oxford University Press.},
  author = {Statham, Aaron L. and Strbenac, Dario and Coolen, Marcel W. and Stirzaker, Clare and Clark, Susan J. and Robinson, Mark D.},
  date = {2010-07-01},
  doi = {10/cnpv85},
  eprint = {20457667},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Statham et al. - 2010 - Repitools An R package for the analysis of enrich.pdf},
  issn = {1367-4803},
  journaltitle = {Bioinformatics},
  number = {13},
  pages = {1662--1663},
  title = {Repitools: {{An R}} Package for the Analysis of Enrichment-Based Epigenomic Data},
  volume = {26}
}

@article{Stegle2010,
  abstract = {As a fruit of the current revolution in sequencing technology, transcriptomes can now be analyzed at an unprecedented level of detail. These advances have been exploited for detecting differential expressed genes across biological samples and for quantifying the abundances of various RNA transcripts within one gene. However, explicit strategies for detecting the hidden differential abundances of RNA transcripts in biological samples have not been defined. In this work, we present two novel statistical tests to address this issue: a 'gene structure sensitive' Poisson test for detecting differential expression when the transcript structure of the gene is known, and a kernel-based test called Maximum Mean Discrepancy when it is unknown. We analyzed the proposed approaches on simulated read data for two artificial samples as well as on factual reads generated by the Illumina Genome Analyzer for two C. elegans samples. Our analysis shows that the Poisson test identifies genes with differential transcript expression considerably better that previously proposed RNA transcript quantification approaches for this task. The MMD test is able to detect a large fraction (75\%) of such differential cases without the knowledge of the annotated transcripts. It is therefore well-suited to analyze RNA-Seq experiments when the genome annotations are incomplete or not available, where other approaches have to fail.},
  author = {Stegle, Oliver and Drewe, Philipp and Bohnert, Regina and Borgwardt, Karsten and R{\"a}tsch, Gunnar},
  date = {2010-05-11},
  doi = {10/fjzshc},
  file = {/Users/ryan/Documents/Zotero Library/Stegle et al. - 2010 - Statistical Tests for Detecting Differential RNA-T.pdf},
  ids = {stegleStatisticalTestsDetecting2010},
  issn = {1756-0357},
  journaltitle = {Nature Precedings},
  langid = {english},
  shortjournal = {Nat Prec},
  title = {Statistical {{Tests}} for {{Detecting Differential RNA}}-{{Transcript Expression}} from {{Read Counts}}}
}

@article{Stegle2010a,
  title = {A {{Bayesian Framework}} to {{Account}} for {{Complex Non}}-{{Genetic Factors}} in {{Gene Expression Levels Greatly Increases Power}} in {{eQTL Studies}}},
  author = {Stegle, Oliver and Parts, Leopold and Durbin, Richard and Winn, John},
  journaltitle = {PLoS Computational Biology},
  date = {2010},
  volume = {6},
  number = {5},
  pages = {e1000770},
  issn = {1553-7358},
  doi = {10/bzfx58},
  file = {/Users/ryan/Documents/Zotero Library/Stegle et al. - 2010 - A Bayesian Framework to Account for Complex Non-Ge.pdf;/Users/ryan/Documents/Zotero Library/Stegle et al. - 2010 - A Bayesian Framework to Account for Complex Non-Ge2.pdf}
}

@article{Stegle2012,
  abstract = {We present PEER (probabilistic estimation of expression residuals), a software package implementing statistical models that improve the sensitivity and interpretability of genetic associations in population-scale expression data. This approach builds on factor analysis methods that infer broad variance components in the measurements. PEER takes as input transcript profiles and covariates from a set of individuals, and then outputs hidden factors that explain much of the expression variability. Optionally, these factors can be interpreted as pathway or transcription factor activations by providing prior information about which genes are involved in the pathway or targeted by the factor. The inferred factors are used in genetic association analyses. First, they are treated as additional covariates, and are included in the model to increase detection power for mapping expression traits. Second, they are analyzed as phenotypes themselves to understand the causes of global expression variability. PEER extends previous related surrogate variable models and can be implemented within hours on a desktop computer.},
  author = {Stegle, Oliver and Parts, Leopold and Piipari, Matias and Winn, John and Durbin, Richard},
  date = {2012-03},
  doi = {10/ggcxmr},
  eprint = {22343431},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Stegle et al. - 2012 - Using probabilistic estimation of expression resid.pdf},
  issn = {1750-2799},
  journaltitle = {Nature protocols},
  keywords = {Algorithms,Factor Analysis; Statistical,Gene Expression Profiling,Gene Expression Profiling: methods,Gene Expression Profiling: statistics \& numerical data,Genetic Association Studies,Genetic Association Studies: methods,Models; Statistical,Sensitivity and Specificity,Software},
  number = {3},
  pages = {500--507},
  title = {Using Probabilistic Estimation of Expression Residuals ({{PEER}}) to Obtain Increased Power and Interpretability of Gene Expression Analyses},
  volume = {7}
}

@article{Steijger2013,
  abstract = {We evaluated 25 protocol variants of 14 independent computational methods for exon identification, transcript reconstruction and expression-level quantification from RNA-seq data. Our results show that most algorithms are able to identify discrete transcript components with high success rates but that assembly of complete isoform structures poses a major challenge even when all constituent elements are identified. Expression-level estimates also varied widely across methods, even when based on similar transcript models. Consequently, the complexity of higher eukaryotic genomes imposes severe limitations on transcript recall and splice product discrimination that are likely to remain limiting factors for the analysis of current-generation RNA-seq data.},
  author = {Steijger, Tamara and Abril, Josep F and Engstr{\"o}m, P{\"a}r G and Kokocinski, Felix and Akerman, Martin and Alioto, Tyler and Ambrosini, Giovanna and Antonarakis, Stylianos E and Behr, Jonas and Bertone, Paul and Bohnert, Regina and Bucher, Philipp and Cloonan, Nicole and Derrien, Thomas and Djebali, Sarah and Du, Jiang and Dudoit, Sandrine and Gerstein, Mark and Gingeras, Thomas R and Gonzalez, David and Grimmond, Sean M and Guig{\'o}, Roderic and Habegger, Lukas and Harrow, Jennifer and Hubbard, Tim J and Iseli, Christian and Jean, G{\'e}raldine and Kahles, Andr{\'e} and Lagarde, Julien and Leng, Jing and Lefebvre, Gregory and Lewis, Suzanna and Mortazavi, Ali and Niermann, Peter and R{\"a}tsch, Gunnar and Reymond, Alexandre and Ribeca, Paolo and Richard, Hugues and Rougemont, Jacques and Rozowsky, Joel and Sammeth, Michael and Sboner, Andrea and Schulz, Marcel H and Searle, Steven M J and Solorzano, Naryttza Diaz and Solovyev, Victor and Stanke, Mario and Stevenson, Brian J and Stockinger, Heinz and Valsesia, Armand and Weese, David and White, Simon and Wold, Barbara J and Wu, Jie and Wu, Thomas D and Zeller, Georg and Zerbino, Daniel and Zhang, Michael Q},
  date = {2013-11-03},
  doi = {10/pt8},
  eprint = {24185837},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Steijger et al. - 2013 - Assessment of transcript reconstruction methods fo.pdf},
  issn = {1548-7105},
  journaltitle = {Nature Methods},
  number = {12},
  pages = {1177--1184},
  title = {Assessment of Transcript Reconstruction Methods for {{RNA}}-Seq},
  volume = {10}
}

@article{Stephens2016b,
  author = {Stephens, Matthew},
  date = {2016},
  file = {/Users/ryan/Documents/Zotero Library/Stephens - 2016 - False Discovery Rates ( FDRs ) A new deal.pdf},
  pages = {1--12},
  title = {False Discovery Rates ({{FDRs}}): {{A}} New Deal}
}

@article{Stephens2017,
  abstract = {We introduce a new Empirical Bayes approach for large-scale hypothesis testing, including estimating false discovery rates (FDRs), and effect sizes. This approach has two key differences from existing approaches to FDR analysis. First, it assumes that the distribution of the actual (unobserved) effects is unimodal, with a mode at 0. This "unimodal assumption" (UA), although natural in many contexts, is not usually incorporated into standard FDR analysis, and we demonstrate how incorporating it brings many benefits. Specifically, the UA facilitates efficient and robust computation-estimating the unimodal distribution involves solving a simple convex optimization problem-and enables more accurate inferences provided that it holds. Second, the method takes as its input two numbers for each test (an effect size estimate and corresponding standard error), rather than the one number usually used ($p$ value or $z$ score). When available, using two numbers instead of one helps account for variation in measurement precision across tests. It also facilitates estimation of effects, and unlike standard FDR methods, our approach provides interval estimates (credible regions) for each effect in addition to measures of significance. To provide a bridge between interval estimates and significance measures, we introduce the term "local false sign rate" to refer to the probability of getting the sign of an effect wrong and argue that it is a superior measure of significance than the local FDR because it is both more generally applicable and can be more robustly estimated. Our methods are implemented in an R package ashr available from http://github.com/stephens999/ashr.},
  author = {Stephens, Matthew},
  date = {2017-04},
  doi = {10/ggcxms},
  eprint = {27756721},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Stephens - 2016 - False discovery rates a new deal.pdf},
  issn = {1465-4644},
  journaltitle = {Biostatistics},
  keywords = {Empirical Bayes,False discovery rates,Multiple testing,Shrinkage,Unimodal},
  number = {2},
  pages = {275--294},
  title = {False Discovery Rates: A New Deal},
  volume = {18}
}

@article{Storey2002,
  abstract = {Multiple-hypothesis testing involves guarding against much more complicated errors than single-hypothesis testing. Whereas we typically control the type I error rate for a single-hypothesis test, a compound error rate is controlled for multiple-hypothesis tests. For example, controlling the false discovery rate FDR traditionally involves intricate sequential p-value rejection methods based on the observed data. Whereas a sequential p-value method fixes the error rate and estimates its corresponding rejection region, we propose the opposite approach\textemdash{}we fix the rejection region and then estimate its corresponding error rate. This new approach offers increased applicability, accuracy and power. We apply the methodology to both the positive false discovery rate pFDR and FDR, and provide evidence for its benefits. It is shown that pFDR is probably the quantity of interest over FDR. Also discussed is the calculation of the q-value, the pFDR analogue of the p-value, which eliminates the need to set the error rate beforehand as is traditionally done. Some simple numerical examples are presented that show that this new approach can yield an increase of over eight times in power compared with the Benjamini\textendash{}Hochberg FDR method.},
  author = {Storey, John D.},
  date = {2002-08},
  doi = {10/dfj3qx},
  eprint = {11917092},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Storey - 2002 - A direct approach to false discovery rates.pdf},
  isbn = {9781405122382},
  issn = {1369-7412},
  journaltitle = {Journal of the Royal Statistical Society: Series B (Statistical Methodology)},
  keywords = {false discovery rate,multiple comparisons,p -values,positive false discovery rate,q -values,sequential p -value methods,simultaneous inference},
  number = {3},
  pages = {479--498},
  title = {A Direct Approach to False Discovery Rates},
  volume = {64}
}

@article{Storey2003,
  abstract = {Multiple hypothesis testing is concerned with controlling the rate of false positives when testing several hypotheses simultaneously. One multiple hypothesis testing error measure is the false discovery rate (FDR), which is loosely defined to be the expected proportion of false positives among all significant hypotheses. The FDR is especially appropriate for exploratory analyses in which one is interested in finding several significant results among many tests. In this work, we introduce a modified version of the FDR called the "positive false discovery rate" (pFDR). We discuss the advantages and disadvantages of the pFDR and investigate its statistical properties. When assuming the test statistics follow a mixture distribution, we show that the pFDR can be written as a Bayesian posterior probability and can be connected to classification theory. These properties remain asymptotically true under fairly general conditions, even under certain forms of dependence. Also, a new quantity called the "q-value" is introduced and investigated, which is a natural "Bayesian posterior p-value," or rather the pFDR analogue of the p-value.},
  author = {Storey, John D.},
  date = {2003},
  doi = {10/fcrd4d},
  eprint = {3448445},
  eprinttype = {jstor},
  file = {/Users/ryan/Documents/Zotero Library/Storey - 2003 - The positive false discovery rate A Bayesian inte.pdf},
  issn = {0090-5364},
  journaltitle = {Annals of Statistics},
  keywords = {Multiple comparisons,p-values,pFDR,pFNR,q-values,Simultaneous inference},
  number = {6},
  pages = {2013--2035},
  title = {The Positive False Discovery Rate: {{A Bayesian}} Interpretation and the q-Value},
  volume = {31}
}

@article{Storey2007,
  author = {Storey, John D.},
  date = {2007-06},
  doi = {10/bp5kcc},
  file = {/Users/ryan/Documents/Zotero Library/Storey - 2007 - The optimal discovery procedure a new approach to.pdf},
  issn = {1369-7412},
  journaltitle = {Journal of the Royal Statistical Society: Series B (Statistical Methodology)},
  keywords = {classification,false discovery rate,multiple-hypothesis testing,optimal discovery,procedure,q -value,single-thresholding procedure},
  number = {3},
  pages = {347--368},
  title = {The Optimal Discovery Procedure: A New Approach to Simultaneous Significance Testing},
  volume = {69}
}

@article{Storey2017,
  author = {Storey, John D},
  date = {2017},
  file = {/Users/ryan/Documents/Zotero Library/Storey - 2017 - The Functional False Discovery Rate with Applicati.pdf},
  pages = {1--27},
  title = {The Functional False Discovery Rate with Applications in Genomics}
}

@article{Stratmann2011,
  abstract = {Interactions of transcription factors with chromatin are highly dynamic. Now Voss et al. (2011) demonstrate that two transcription factors with identical DNA-binding specificities do not compete for occupancy at a given DNA element, but instead, one factor can even facilitate the binding of another. This assisted loading probably involves chromatin-remodeling machines.},
  author = {Stratmann, Markus and Schibler, Ueli},
  date = {2011-08-19},
  doi = {10/fbr7rr},
  eprint = {21854974},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Stratmann and Schibler - 2011 - Transcription factor loading please take my place.pdf},
  issn = {1097-4172},
  journaltitle = {Cell},
  number = {4},
  pages = {497--499},
  title = {Transcription Factor Loading: Please Take My Place!},
  volume = {146}
}

@article{Strimmer2008,
  abstract = {Background: False discovery rate (FDR) methods play an important role in analyzing high-dimensional data. There are two types of FDR, tail area-based FDR and local FDR, as well as numerous statistical algorithms for estimating or controlling FDR. These differ in terms of underlying test statistics and procedures employed for statistical learning. Results: A unifying algorithm for simultaneous estimation of both local FDR and tail area-based FDR is presented that can be applied to a diverse range of test statistics, including p-values, correlations, z- and t-scores. This approach is semipararametric and is based on a modified Grenander density estimator. For test statistics other than p-values it allows for empirical null modeling, so that dependencies among tests can be taken into account. The inference of the underlying model employs truncated maximum-likelihood estimation, with the cut-off point chosen according to the false non-discovery rate. Conclusion: The proposed procedure generalizes a number of more specialized algorithms and thus offers a common framework for FDR estimation consistent across test statistics and types of FDR. In comparative study the unified approach performs on par with the best competing yet more specialized alternatives. The algorithm is implemented in R in the "fdrtool" package, available under the GNU GPL from http://strimmerlab.org/software/fdrtool/ and from the R package archive CRAN. \textcopyright{} 2008 Strimmer; licensee BioMed Central Ltd.},
  author = {Strimmer, Korbinian},
  date = {2008-01},
  doi = {10/fksn5x},
  eprint = {18613966},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Strimmer - 2008 - A unified approach to false discovery rate estimat.pdf},
  issn = {1471-2105},
  journaltitle = {BMC Bioinformatics},
  keywords = {Algorithms,Biometry,Biometry: methods,Breast Neoplasms,Breast Neoplasms: genetics,Confidence Intervals,Female,HIV,HIV: genetics,Humans,Likelihood Functions,Models,Oligonucleotide Array Sequence Analysis,Predictive Value of Tests,Sample Size,Software,Statistical},
  number = {1},
  pages = {303},
  title = {A Unified Approach to False Discovery Rate Estimation},
  volume = {9}
}

@article{Strimmer2008a,
  abstract = {SUMMARY: False discovery rate (FDR) methodologies are essential in the study of high-dimensional genomic and proteomic data. The R package 'fdrtool' facilitates such analyses by offering a comprehensive set of procedures for FDR estimation. Its distinctive features include: (i) many different types of test statistics are allowed as input data, such as P-values, z-scores, correlations and t-scores; (ii) simultaneously, both local FDR and tail area-based FDR values are estimated for all test statistics and (iii) empirical null models are fit where possible, thereby taking account of potential over- or underdispersion of the theoretical null. In addition, 'fdrtool' provides readily interpretable graphical output, and can be applied to very large scale (in the order of millions of hypotheses) multiple testing problems. Consequently, 'fdrtool' implements a flexible FDR estimation scheme that is unified across different test statistics and variants of FDR. AVAILABILITY: The program is freely available from the Comprehensive R Archive Network (http://cran.r-project.org/) under the terms of the GNU General Public License (version 3 or later). CONTACT: strimmer@uni-leipzig.de.},
  author = {Strimmer, Korbinian},
  date = {2008},
  doi = {10/cmx22k},
  eprint = {18441000},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Strimmer - 2008 - fdrtool A versatile R package for estimating loca.pdf},
  isbn = {3900051070},
  issn = {1367-4803},
  journaltitle = {Bioinformatics},
  number = {12},
  pages = {1461--1462},
  title = {{{fdrtool}}: {{A}} Versatile {{R}} Package for Estimating Local and Tail Area-Based False Discovery Rates},
  volume = {24}
}

@article{Subramanian2005,
  author = {Subramanian, A. and Tamayo, P. and Mootha, V. K. and Mukherjee, S. and Ebert, B. L. and Gillette, M. A. and Paulovich, A. and Pomeroy, S. L. and Golub, T. R. and Lander, E. S. and Mesirov, J. P.},
  date = {2005-10-25},
  doi = {10/d4qbh8},
  file = {/Users/ryan/Documents/Zotero Library/Subramanian et al. - 2005 - Gene set enrichment analysis A knowledge-based ap.pdf},
  issn = {0027-8424, 1091-6490},
  journaltitle = {Proceedings of the National Academy of Sciences},
  langid = {english},
  number = {43},
  pages = {15545--15550},
  shortjournal = {Proceedings of the National Academy of Sciences},
  shorttitle = {Gene Set Enrichment Analysis},
  title = {Gene Set Enrichment Analysis: {{A}} Knowledge-Based Approach for Interpreting Genome-Wide Expression Profiles},
  volume = {102}
}

@article{Sun2013,
  abstract = {BACKGROUND: Differential expression analysis based on "next-generation" sequencing technologies is a fundamental means of studying RNA expression. We recently developed a multi-step normalization method (called TbT) for two-group RNA-seq data with replicates and demonstrated that the statistical methods available in four R packages (edgeR, DESeq, baySeq, and NBPSeq) together with TbT can produce a well-ranked gene list in which true differentially expressed genes (DEGs) are top-ranked and non-DEGs are bottom ranked. However, the advantages of the current TbT method come at the cost of a huge computation time. Moreover, the R packages did not have normalization methods based on such a multi-step strategy.

RESULTS: TCC (an acronym for Tag Count Comparison) is an R package that provides a series of functions for differential expression analysis of tag count data. The package incorporates multi-step normalization methods, whose strategy is to remove potential DEGs before performing the data normalization. The normalization function based on this DEG elimination strategy (DEGES) includes (i) the original TbT method based on DEGES for two-group data with or without replicates, (ii) much faster methods for two-group data with or without replicates, and (iii) methods for multi-group comparison. TCC provides a simple unified interface to perform such analyses with combinations of functions provided by edgeR, DESeq, and baySeq. Additionally, a function for generating simulation data under various conditions and alternative DEGES procedures consisting of functions in the existing packages are provided. Bioinformatics scientists can use TCC to evaluate their methods, and biologists familiar with other R packages can easily learn what is done in TCC.

CONCLUSION: DEGES in TCC is essential for accurate normalization of tag count data, especially when up- and down-regulated DEGs in one of the samples are extremely biased in their number. TCC is useful for analyzing tag count data in various scenarios ranging from unbiased to extremely biased differential expression. TCC is available at http://www.iu.a.u-tokyo.ac.jp/\textasciitilde{}kadota/TCC/ and will appear in Bioconductor (http://bioconductor.org/) from ver. 2.13.},
  author = {Sun, Jianqiang and Nishiyama, Tomoaki and Shimizu, Kentaro and Kadota, Koji},
  date = {2013-01},
  doi = {10/gb8v2t},
  eprint = {23837715},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Sun et al. - 2013 - TCC an R package for comparing tag count data wit.pdf},
  issn = {1471-2105},
  journaltitle = {BMC Bioinformatics},
  number = {1},
  pages = {219},
  title = {{{TCC}}: An {{R}} Package for Comparing Tag Count Data with Robust Normalization Strategies},
  volume = {14}
}

@article{Tan2009,
  abstract = {Joint analysis of transcriptomic and proteomic data taken from the same samples has the potential to elucidate complex biological mechanisms. Most current methods that integrate these datasets allow for the computation of the correlation between a gene and protein but only after a one-to-one matching of genes and proteins is done. However, genes and proteins are connected via biological pathways and their relationship is not necessarily one-to-one. In this paper, we investigate the use of Correlated Factor Analysis (CFA) for modeling the correlation of genome-scale gene and protein data. Unlike existing approaches, CFA considers all possible gene-protein pairs and utilizes all gene and protein information in its modeling framework. The Generalized Singular Value Decomposition (gSVD) is another method which takes into account all available transcriptomic and proteomic data. Comparison is made between CFA and gSVD.},
  author = {Tan, Chuen Seng and Salim, Agus and Ploner, Alexander and Lehti{\"o}, Janne and Chia, Kee Seng and Pawitan, Yudi},
  date = {2009-01},
  doi = {10/drqjcp},
  eprint = {19723309},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Tan et al. - 2009 - Correlating gene and protein expression data using.pdf},
  issn = {1471-2105},
  journaltitle = {BMC Bioinformatics},
  keywords = {Algorithms,Computational Biology,Computational Biology: methods,Gene Expression,Gene Expression Profiling,Gene Expression Profiling: methods,Proteins,Proteins: chemistry,Proteins: genetics,Proteins: metabolism},
  pages = {272},
  title = {Correlating Gene and Protein Expression Data Using {{Correlated Factor Analysis}}},
  volume = {10}
}

@article{Tan2017,
  author = {Tan, Jie and Huyck, Matthew and Hu, Dongbo and Zelaya, Ren{\'e} A and Hogan, Deborah A and Greene, Casey S},
  date = {2017},
  doi = {10/gcw9f6},
  file = {/Users/ryan/Documents/Zotero Library/Tan et al. - 2017 - ADAGE signature analysis  differential expression.pdf},
  title = {{{ADAGE}} Signature Analysis: Differential Expression Analysis with Data-Defined Gene Sets}
}

@article{Tanner2007,
  abstract = {Annotation of protein-coding genes is a key goal of genome sequencing projects. In spite of tremendous recent advances in computational gene finding, comprehensive annotation remains a challenge. Peptide mass spectrometry is a powerful tool for researching the dynamic proteome and suggests an attractive approach to discover and validate protein-coding genes. We present algorithms to construct and efficiently search spectra against a genomic database, with no prior knowledge of encoded proteins. By searching a corpus of 18.5 million tandem mass spectra (MS/MS) from human proteomic samples, we validate 39,000 exons and 11,000 introns at the level of translation. We present translation-level evidence for novel or extended exons in 16 genes, confirm translation of 224 hypothetical proteins, and discover or confirm over 40 alternative splicing events. Polymorphisms are efficiently encoded in our database, allowing us to observe variant alleles for 308 coding SNPs. Finally, we demonstrate the use of mass spectrometry to improve automated gene prediction, adding 800 correct exons to our predictions using a simple rescoring strategy. Our results demonstrate that proteomic profiling should play a role in any genome sequencing project.},
  author = {Tanner, Stephen and Shen, Zhouxin and Ng, Julio and Florea, Liliana and Guig{\'o}, Roderic and Briggs, Steven P and Bafna, Vineet},
  date = {2007-02},
  doi = {10/bdfs97},
  eprint = {17189379},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Tanner et al. - 2007 - Improving gene annotation using peptide mass spect.pdf},
  issn = {1088-9051},
  journaltitle = {Genome Research},
  keywords = {Algorithms,Alternative Splicing,Amino Acid Sequence,Databases; Protein,Exons,Genetic Techniques,Humans,Introns,Mass Spectrometry,Mass Spectrometry: methods,Molecular Sequence Data,Peptides,Peptides: chemistry,Peptides: genetics,Polymorphism; Single Nucleotide,Protein Array Analysis,Protein Array Analysis: methods,Protein Array Analysis: statistics & numerical dat,Proteomics,Proteomics: methods,Proteomics: statistics & numerical data,Sequence Alignment},
  number = {2},
  pages = {231--239},
  title = {Improving Gene Annotation Using Peptide Mass Spectrometry},
  volume = {17}
}

@article{Tarca2009,
  abstract = {MOTIVATION: Gene expression class comparison studies may identify hundreds or thousands of genes as differentially expressed (DE) between sample groups. Gaining biological insight from the result of such experiments can be approached, for instance, by identifying the signaling pathways impacted by the observed changes. Most of the existing pathway analysis methods focus on either the number of DE genes observed in a given pathway (enrichment analysis methods), or on the correlation between the pathway genes and the class of the samples (functional class scoring methods). Both approaches treat the pathways as simple sets of genes, disregarding the complex gene interactions that these pathways are built to describe.

RESULTS: We describe a novel signaling pathway impact analysis (SPIA) that combines the evidence obtained from the classical enrichment analysis with a novel type of evidence, which measures the actual perturbation on a given pathway under a given condition. A bootstrap procedure is used to assess the significance of the observed total pathway perturbation. Using simulations we show that the evidence derived from perturbations is independent of the pathway enrichment evidence. This allows us to calculate a global pathway significance P-value, which combines the enrichment and perturbation P-values. We illustrate the capabilities of the novel method on four real datasets. The results obtained on these data show that SPIA has better specificity and more sensitivity than several widely used pathway analysis methods.

AVAILABILITY: SPIA was implemented as an R package available at http://vortex.cs.wayne.edu/ontoexpress/},
  author = {Tarca, Adi Laurentiu and Draghici, Sorin and Khatri, Purvesh and Hassan, Sonia S. and Mittal, Pooja and Kim, Jung-sun and Kim, Chong Jai and Kusanovic, Juan Pedro and Romero, Roberto},
  date = {2009-01-01},
  doi = {10/bxg2p2},
  eprint = {18990722},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Tarca et al. - 2009 - A novel signaling pathway impact analysis.pdf},
  issn = {1460-2059},
  journaltitle = {Bioinformatics},
  number = {1},
  pages = {75--82},
  title = {A Novel Signaling Pathway Impact Analysis},
  volume = {25}
}

@article{Taudt2016,
  abstract = {Post-translational modifications of histone residue tails are an important component of genome regulation. It is becoming increasingly clear that the combinatorial presence and absence of various modifications define discrete chromatin states which determine the functional properties of a locus. An emerging experimental goal is to compare genome-wide chromatin state maps across different conditions, such as experimental treatments, cell-types or developmental time points. Here we present chromstaR, an algorithm for the computational inference of combinatorial chromatin state dynamics across an arbitrary number of conditions. ChromstaR uses a multivariate Hidden Markov Model to assign every genomic region to a discrete combinatorial chromatin state based on the presence/absence of each modification in every condition. This interpretation makes it easy to relate the inferred chromatin states back to the underlying histone modification patterns. Moreover, the algorithm computes the number of combinatorial chromatin states that are present in the genome without having to specify them a priori, thus providing an unbiased picture of their genome-wide frequencies. We demonstrate the advantages of chromstaR in the context of three common experimental data scenarios. First, we study how different histone modifications combine to form combinatorial chromatin states in a single tissue. Second, we infer genome-wide patterns of combinatorial state differences between two cell types or conditions. Finally, we study the dynamics of combinatorial chromatin states during tissue differentiation involving up to six differentiation points. chromstaR is a versatile computational tool that facilitates a deeper biological understanding of chromatin organization and dynamics. The algorithm is written in C++ and freely available as an R-package at https://github.com/ataudt/chromstaR.},
  author = {Taudt, Aaron and Nguyen, Minh Anh and Heinig, Matthias and Johannes, Frank and Colome-Tatche, Maria},
  date = {2016},
  doi = {10/ggcxmt},
  file = {/Users/ryan/Documents/Zotero Library/Taudt et al. - 2016 - chromstaR Tracking combinatorial chromatin state .pdf},
  journaltitle = {bioRxiv},
  keywords = {chip-seq,chromatin state map,computational biology,epigenetics,histone modification},
  pages = {038612},
  title = {{{chromstaR}}: {{Tracking}} Combinatorial Chromatin State Dynamics in Space and Time}
}

@report{Tchourine2017,
  abstract = {Inference of eukaryotic transcription regulatory networks remains challenging due to the large number of regulators, combinatorial interactions, and redundant pathways. Even in the model system Saccharomyces cerevisiae, inference has performed poorly. Most existing inference algorithms ignore crucial regulatory components, like RNA stability and post-transcriptional modulation of regulators. Here we demonstrate that explicitly modeling transcription factor activity and RNA half-lives during inference of a genome-wide transcription regulatory network in yeast not only advances prediction performance, but also produces new insights into gene- and condition-specific variation of RNA stability. We curated a high quality gold standard reference network that we use for priors on network structure and model validation. We incorporate variation of RNA half-lives into the Inferelator inference framework, and show improved performance over previously described algorithms and over implementations of the algorithm that do not model RNA degradation. We recapitulate known condition- and gene-specific trends in RNA half-lives, and make new predictions about RNA half-lives that are confirmed by experimental data.},
  author = {Tchourine, Konstantine and Vogel, Christine and Bonneau, Richard},
  date = {2017-01-31},
  doi = {10/dd8q},
  file = {/Users/ryan/Documents/Zotero Library/Tchourine et al. - 2017 - Explicit Modeling of RNA Stability Improves Large-.pdf},
  institution = {{Biophysics}},
  langid = {english},
  title = {Explicit Modeling of {{RNA}} Stability Improves Large-Scale Inference of Transcription Regulation},
  type = {preprint},
  url = {http://biorxiv.org/lookup/doi/10.1101/104885},
  urldate = {2019-11-14}
}

@article{Tekes2011,
  abstract = {The RNA synthesis machinery of vesicular stomatitis virus (VSV) comprises the genomic RNA encapsidated by the viral nucleocapsid protein (N) and associated with the RNA dependent RNA polymerase, the viral components of which are a large protein (L) and an accessory phosphoprotein (P). The 241 kDa L protein contains all the enzymatic activities necessary for synthesis of the viral mRNAs, including capping, cap methylation and polyadenylation. Those RNA processing reactions are intimately coordinated with nucleotide polymerization such that failure to cap results in termination of transcription and failure to methylate can result in hyper polyadenylation. The mRNA processing reactions thus serve as a critical check point in viral RNA synthesis which may control the synthesis of incorrectly modified RNAs. Here, we report the length at which viral transcripts first gain access to the capping machinery during synthesis. By reconstitution of transcription in vitro with highly purified recombinant polymerase and engineered templates in which we omitted sites for incorporation of UTP, we found that transcripts that were 30-nucleotides in length were uncapped, whereas those that were 31-nucleotides in length contained a cap structure. The minimal RNA length required for mRNA cap addition was also sufficient for methylation since the 31-nucleotide long transcripts were methylated at both ribose-2'-O and guanine-N-7 positions. This work provides insights into the spatial relationship between the active sites for the RNA dependent RNA polymerase and polyribonucleotidyltransferase responsible for capping of the viral RNA. We combine the present findings with our recently described electron microscopic structure of the VSV polymerase and propose a model of how the spatial arrangement of the capping activities of L may influence nucleotide polymerization.},
  author = {Tekes, Gergely and Rahmeh, Amal A and Whelan, Sean P J},
  date = {2011-06},
  doi = {10/fbxjb3},
  eprint = {21655110},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Tekes et al. - 2011 - A freeze frame view of vesicular stomatitis virus .pdf},
  issn = {1553-7374},
  journaltitle = {PLoS Pathogens},
  keywords = {5' Flanking Region,5' Flanking Region: genetics,Animals,Cells; Cultured,Guanine,Guanine: metabolism,Methylation,Nucleocapsid,Nucleocapsid: genetics,Organisms; Genetically Modified,Ribose,Ribose: metabolism,RNA Caps,RNA Caps: genetics,RNA Caps: metabolism,RNA Replicase,RNA Replicase: genetics,RNA Replicase: metabolism,RNA; Messenger,RNA; Messenger: genetics,RNA; Messenger: metabolism,RNA; Viral,RNA; Viral: genetics,RNA; Viral: metabolism,Spodoptera,Transcription; Genetic,Uridine Triphosphate,Uridine Triphosphate: metabolism,Vesicular Stomatitis,Vesicular Stomatitis: virology,Vesiculovirus,Vesiculovirus: genetics,Vesiculovirus: metabolism,Viral Nonstructural Proteins,Viral Nonstructural Proteins: genetics,Viral Nonstructural Proteins: metabolism,Viral Proteins,Viral Proteins: genetics,Viral Proteins: metabolism,Virus Replication,Virus Replication: genetics},
  number = {6},
  pages = {e1002073},
  title = {A Freeze Frame View of Vesicular Stomatitis Virus Transcription Defines a Minimal Length of {{RNA}} for 5' Processing},
  volume = {7}
}

@report{Teng2016,
  abstract = {The main application of ChIP-seq technology is the detection of genomic regions that bind to a protein of interest. A large part of functional genomics public catalogs are based on ChIP-seq data. These catalogs rely on peak calling algorithms that infer protein-binding sites by detecting genomic regions associated with more mapped reads (coverage) than expected by chance as a result of the experimental protocol's lack of perfect specificity. We find that GC-content bias accounts for substantial variability in the observed coverage for ChIP-Seq experiments and that this variability leads to false-positive peak calls. More concerning is that GC-effect varies across experiments, with the effect strong enough to result in a substantial number of peaks called differently when different laboratories perform experiments on the same cell-line. However, accounting for GC-content in ChIP-Seq is challenging because the binding sites of interest tend to be more common in high GC-content regions, which confounds real biological signal with the unwanted variability. To account for this challenge we introduce a statistical approach that accounts for GC-effects on both non-specific noise and signal induced by the binding site. The method can be used to account for this bias in binding quantification as well to improve existing peak calling algorithms. We use this approach to show a reduction in false positive peaks as well as improved consistency across laboratories.},
  author = {Teng, Mingxiang and Irizarry, Rafael A.},
  date = {2016-11-30},
  doi = {10/dd8s},
  file = {/Users/ryan/Documents/Zotero Library/Teng and Irizarry - 2016 - Accounting for GC-content bias reduces systematic .pdf},
  institution = {{Genomics}},
  langid = {english},
  title = {Accounting for {{GC}}-Content Bias Reduces Systematic Errors and Batch Effects in {{ChIP}}-{{Seq}} Data},
  type = {preprint},
  url = {http://biorxiv.org/lookup/doi/10.1101/090704},
  urldate = {2019-11-14}
}

@article{Teng2016a,
  abstract = {Obtaining RNA-seq measurements involves a complex data analytical process with a large number of competing algorithms as options. There is much debate about which of these methods provides the best approach. Unfortunately, it is currently difficult to evaluate their performance due in part to a lack of sensitive assessment metrics. We present a series of statistical summaries and plots to evaluate the performance in terms of specificity and sensitivity, available as a R/Bioconductor package ( http://bioconductor.org/packages/rnaseqcomp ). Using two independent datasets, we assessed seven competing pipelines. Performance was generally poor, with two methods clearly underperforming and RSEM slightly outperforming the rest.},
  author = {Teng, Mingxiang and Love, Michael I. and Davis, Carrie A. and Djebali, Sarah and Dobin, Alexander and Graveley, Brenton R. and Li, Sheng and Mason, Christopher E. and Olson, Sara and Pervouchine, Dmitri and Sloan, Cricket A. and Wei, Xintao and Zhan, Lijun and Irizarry, Rafael A.},
  date = {2016},
  doi = {10/gfj2zz},
  eprint = {27107712},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Teng et al. - 2016 - A benchmark for RNA-seq quantification pipelines..pdf},
  issn = {1474-760X},
  journaltitle = {Genome biology},
  number = {1},
  pages = {74},
  title = {A Benchmark for {{RNA}}-Seq Quantification Pipelines},
  volume = {17}
}

@article{Teschendorff2011,
  abstract = {Motivation: A common difficulty in large-scale microarray studies is the presence of confounding factors, which may significantly skew estimates of statistical significance, cause unreliable feature selection and high false negative rates. To deal with these difficulties, an algorithmic framework known as Surrogate Variable Analysis (SVA) was recently proposed. Results: Based on the notion that data can be viewed as an interference pattern, reflecting the superposition of independent effects and random noise, we present a modified SVA, called Independent Surrogate Variable Analysis (ISVA), to identify features correlating with a phenotype of interest in the presence of potential confounding factors. Using simulated data, we show that ISVA performs well in identifying confounders as well as outperforming methods which do not adjust for confounding. Using four large-scale Illumina Infinium DNA methylation datasets subject to low signal to noise ratios and substantial confounding by beadchip effects and variable bisulfite conversion efficiency, we show that ISVA improves the identifiability of confounders and that this enables a framework for feature selection that is more robust to model misspecification and heterogeneous phenotypes. Finally, we demonstrate similar improvements of ISVA across four mRNA expression datasets. Thus, ISVA should be useful as a feature selection tool in studies that are subject to confounding. \textcopyright{} The Author 2011. Published by Oxford University Press. All rights reserved.},
  author = {Teschendorff, Andrew E. and Zhuang, Joanna and Widschwendter, Martin},
  date = {2011-06-01},
  doi = {10/chssjw},
  eprint = {21471010},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Teschendorff et al. - 2011 - Independent surrogate variable analysis to deconvo.pdf},
  issn = {1367-4803},
  journaltitle = {Bioinformatics},
  number = {11},
  pages = {1496-1505},
  title = {Independent Surrogate Variable Analysis to Deconvolve Confounding Factors in Large-Scale Microarray Profiling Studies},
  volume = {27}
}

@article{tHoen2013,
  author = {'t Hoen, Peter A C and Friedl\"ander, Marc R and Alml\"of, Jonas and Sammeth, Michael and Pulyakhina, Irina and Anvar, Seyed Yahya and Laros, Jeroen F J and Buermans, Henk P J and Karlberg, Olof and Br\"annvall, Mathias and van Ommen, Gert-Jan B and Estivill, Xavier and Guig\'o, Roderic and Syv\"anen, Ann-Christine and Gut, Ivo G and Dermitzakis, Emmanouil T and Antonarakis, Stylianos E and Brazma, Alvis and Flicek, Paul and Schreiber, Stefan and Rosenstiel, Philip and Meitinger, Thomas and Strom, Tim M and Lehrach, Hans and Sudbrak, Ralf and Carracedo, Angel and van Iterson, Maarten and Monlong, Jean and Lizano, Esther and Bertier, Gabrielle and Ferreira, Pedro G and Ribeca, Paolo and Griebel, Thasso and Beltran, Sergi and Gut, Marta and Kahlem, Katja and Lappalainen, Tuuli and Giger, Thomas and Ongen, Halit and Padioleau, Ismael and Kilpinen, Helena and Gonz\`alez-Porta, Mar and Kurbatova, Natalja and Tikhonov, Andrew and Greger, Liliana and Barann, Matthias and Esser, Daniela and H\"asler, Robert and Wieland, Thomas and Schwarzmayr, Thomas and Sultan, Marc and Amstislavskiy, Vyacheslav and den Dunnen, Johan T},
  date = {2013-09-15},
  doi = {10/f2zksj},
  file = {/Users/ryan/Documents/Zotero Library/'t Hoen et al. - 2013 - Reproducibility of high-throughput mRNA and small .pdf},
  issn = {1087-0156},
  journaltitle = {Nature Biotechnology},
  number = {11},
  options = {useprefix=true},
  pages = {1015-1022},
  title = {Reproducibility of High-Throughput {{mRNA}} and Small {{RNA}} Sequencing across Laboratories},
  volume = {31}
}

@misc{Thompson2008,
  author = {Thompson, Ryan C.},
  date = {2008-05-07},
  file = {/Users/ryan/Documents/Zotero Library/Thompson - 2008 - The Sources and Limits of Geometric Rigor from Euc.pdf},
  organization = {{University of Virginia}},
  title = {The {{Sources}} and {{Limits}} of {{Geometric Rigor}} from {{Euclid Through Descartes}}},
  url = {http://darwinawardwinner.github.io/resume/examples/UVa/math-history-paper.pdf}
}

@misc{Thompson2009,
  abstract = {Here we present Contig Farmer, a tool for improving the length and depth of coverage of contigs generated from a database of short sequence reads. Contig Farmer works without assembling the entire database and has only modest hardware requirements. The underlying methodology of Contig Farmer is iterative growth of seed contigs using repeated search and assembly. The utility of Contig Farmer is demonstrated on the sequences in TOBFAC, the database of tobacco transcription factors. Contig Farmer successfully grew the TOBFAC contigs, both in length and in depth of coverage, to yield a larger, higher-quality set of contigs.},
  author = {Thompson, Ryan C. and Rushton, Paul J. and Laudeman, Tom W. and Timko, Michael P.},
  date = {2009-06},
  file = {/Users/ryan/Documents/Zotero Library/Thompson et al. - 2009 - Contig Farmer  A tool for extracting maximal-leng.pdf},
  organization = {{University of Virginia}},
  title = {Contig {{Farmer}}: {{A}} Tool for Extracting Maximal-Length Contiguous Sequences from a Database of Short Sequence Reads},
  type = {Undergraduate thesis},
  url = {http://darwinawardwinner.github.io/resume/examples/UVa/contigfarmer.pdf}
}

@thesis{Thompson2019,
  abstract = {Transplant rejection mediated by adaptive immune response is the major challenge to long-term graft survival. Rejection is treated with immune suppressive drugs, but early diagnosis is essential for effective treatment. Memory lymphocytes are known to resist immune suppression, but the precise regulatory mechanisms underlying immune memory are still poorly understood. High-throughput genomic assays such as microarrays, RNA-seq, and ChIP-seq are heavily used in the study of immunology and transplant rejection. Here we present 3 analyses of such assays in this context. First, we re-analyze a large data set consisting of H3K4me2, H3K4me3, and H3K27me3 ChIP-seq data and RNA-seq data in na\"ive and memory CD4+ T-cells using modern bioinformatics methods designed to address deficiencies in the data and extend the analysis in several new directions. All 3 histone marks are found to occur in broad regions and are enriched near promoters, but the radius of promoter enrichment is found to be larger for H3K27me3. We observe that both gene expression and promoter histone methylation in na\"ive and memory cells converges on a common signature 14 days after activation, consistent with differentiation of na\"ive cells into memory cells. The location of histone modifications within the promoter is also found to be important, with asymmetric associations with gene expression for peaks located the same distance up- or downstream of the TSS. Second, we demonstrate the effectiveness of fRMA as a single-channel normalization for using expression arrays to diagnose transplant rejection in a clinical diagnostic setting, and we develop a custom fRMA normalization for a previously unsupported array platform. For methylation arrays, we adapt methods designed for RNA-seq to improve the sensitivity of differential methylation analysis by modeling the heteroskedasticity inherent in the data. 
Finally, we present and validate a novel method for RNA-seq of cynomolgus monkey blood samples using complementary oligonucleotides to prevent wasteful over-sequencing of globin genes. These results all demonstrate the usefulness of a toolbox full of flexible and modular analysis methods in analyzing complex high-throughput assays in contexts ranging from basic science to translational medicine.},
  author = {Thompson, Ryan C.},
  date = {2019},
  file = {/Users/ryan/Documents/Zotero Library/Thompson - 2019 - Bioinformatic analysis of complex , high-throughpu.pdf},
  institution = {{The Scripps Research Institute}},
  title = {Bioinformatic Analysis of Complex, High-Throughput Genomic and Epigenomic Data in the Context of {{CD4}}+ {{T}}-Cell Differentiation and Diagnosis and Treatment of Transplant Rejection},
  type = {phdthesis}
}

@unpublished{Thompson2019a,
  author = {Thompson, Ryan C. and Lamere, Sarah A. and Salomon, Daniel R.},
  date = {2019},
  keywords = {\#nosource},
  location = {{La Jolla, CA}},
  note = {Institution: The Scripps Research Institute},
  pubstate = {inpreparation},
  title = {Reproducible Genome-Wide Epigenetic Analysis of {{H3K4}} and {{H3K27}} Methylation in Na\"ive and Memory {{CD4}}+ {{T}}-Cell Activation}
}

@software{thompsonReproducibleReanalysisCombined2019,
  author       = {Thompson, Ryan C.},
  title        = {Reproducible Reanalysis of a Combined {{ChIP}}-{{Seq}} \& {{RNA}}-{{Seq}} Data Set},
  organization = {{The Scripps Research Institute}},
  location     = {{La Jolla, CA}},
  date         = {2019-08-09T16:53:58Z},
  origdate     = {2016-05-03T21:32:00Z},
  url          = {https://github.com/DarwinAwardWinner/CD4-csaw},
  urldate      = {2019-11-14},
  ids          = {gh-cd4-csaw},
  keywords     = {⛔ No DOI found,bioconductor,bioinformatics-pipeline,chip-seq,chipseq,r,reproducible-research,rna-seq,rnaseq},
  abstract     = {This is the code for a re-analysis of a GEO dataset that I originally analyzed for this paper using statistical methods that were not yet available at the time, such as the csaw Bioconductor package, which provides a principled way to normalize windowed counts of ChIP-Seq reads and test them for differential binding. The original paper only analyzed binding within pre-defined promoter regions. In addition, some improvements have also been made to the RNA-seq analysis using newer features of limma such as quality weights.},
}

@software{thompsonWorkflowDownloadGenerate2018,
  abstract = {This is a Snakemake workflow for downloading the hg38 genome \& transcriptome and building indices for various alignment and mapping tools, in a completely automated fashion.},
  author = {Thompson, Ryan C.},
  date = {2018-01-20T10:54:21Z},
  ids = {gh-hg38-ref},
  keywords = {⛔ No DOI found},
  location = {{La Jolla, CA}},
  organization = {{The Scripps Research Institute}},
  origdate = {2016-07-16T02:46:13Z},
  title = {Workflow to Download/Generate Various Mapping Indices for the Human {{hg38}} Genome},
  url = {https://github.com/DarwinAwardWinner/hg38-ref},
  urldate = {2019-11-14}
}

@article{Thomson2010,
  abstract = {CpG islands (CGIs) are prominent in the mammalian genome owing to their GC-rich base composition and high density of CpG dinucleotides. Most human gene promoters are embedded within CGIs that lack DNA methylation and coincide with sites of histone H3 lysine 4 trimethylation (H3K4me3), irrespective of transcriptional activity. In spite of these intriguing correlations, the functional significance of non-methylated CGI sequences with respect to chromatin structure and transcription is unknown. By performing a search for proteins that are common to all CGIs, here we show high enrichment for Cfp1, which selectively binds to non-methylated CpGs in vitro. Chromatin immunoprecipitation of a mono-allelically methylated CGI confirmed that Cfp1 specifically associates with non-methylated CpG sites in vivo. High throughput sequencing of Cfp1-bound chromatin identified a notable concordance with non-methylated CGIs and sites of H3K4me3 in the mouse brain. Levels of H3K4me3 at CGIs were markedly reduced in Cfp1-depleted cells, consistent with the finding that Cfp1 associates with the H3K4 methyltransferase Setd1 (refs 7, 8). To test whether non-methylated CpG-dense sequences are sufficient to establish domains of H3K4me3, we analysed artificial CpG clusters that were integrated into the mouse genome. Despite the absence of promoters, the insertions recruited Cfp1 and created new peaks of H3K4me3. The data indicate that a primary function of non-methylated CGIs is to genetically influence the local chromatin modification state by interaction with Cfp1 and perhaps other CpG-binding proteins.},
  author = {Thomson, John P and Skene, Peter J and Selfridge, Jim and Clouaire, Thomas and Guy, Jacky and Webb, Shaun and Kerr, Alastair R W and Deaton, Aim\'ee and Andrews, Rob and James, Keith D and Turner, Daniel J and Illingworth, Robert and Bird, Adrian},
  date = {2010-04-15},
  doi = {10/brnhq4},
  eprint = {20393567},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Thomson et al. - 2010 - CpG islands influence chromatin structure via the .pdf},
  issn = {1476-4687},
  journaltitle = {Nature},
  keywords = {Alleles,Animals,Brain,Brain: cytology,Cell Line,Chromatin,Chromatin Assembly and Disassembly,Chromatin Immunoprecipitation,Chromatin: genetics,Chromatin: metabolism,CpG Islands,CpG Islands: genetics,DNA Methylation,Genome,Genome: genetics,Histone-Lysine N-Methyltransferase,Histone-Lysine N-Methyltransferase: metabolism,Histones,Histones: chemistry,Histones: metabolism,Methylation,Mice,NIH 3T3 Cells,Promoter Regions; Genetic,Trans-Activators,Trans-Activators: chemistry,Trans-Activators: deficiency,Trans-Activators: genetics,Trans-Activators: metabolism,Zinc Fingers},
  number = {7291},
  pages = {1082-6},
  title = {{{CpG}} Islands Influence Chromatin Structure via the {{CpG}}-Binding Protein {{Cfp1}}},
  volume = {464}
}

@article{Tian2005a,
  abstract = {Accurate and rapid identification of perturbed pathways through the analysis of genome-wide expression profiles facilitates the generation of biological hypotheses. We propose a statistical framework for determining whether a specified group of genes for a pathway has a coordinated association with a phenotype of interest. Several issues on proper hypothesis-testing procedures are clarified. In particular, it is shown that the differences in the correlation structure of each set of genes can lead to a biased comparison among gene sets unless a normalization procedure is applied. We propose statistical tests for two important but different aspects of association for each group of genes. This approach has more statistical power than currently available methods and can result in the discovery of statistically significant pathways that are not detected by other methods. This method is applied to data sets involving diabetes, inflammatory myopathies, and Alzheimer's disease, using gene sets we compiled from various public databases. In the case of inflammatory myopathies, we have correctly identified the known cytotoxic T lymphocyte-mediated autoimmunity in inclusion body myositis. Furthermore, we predicted the presence of dendritic cells in inclusion body myositis and of an IFN-alpha/beta response in dermatomyositis, neither of which was previously described. These predictions have been subsequently corroborated by immunohistochemistry.},
  author = {Tian, Lu and Greenberg, Steven A and Kong, Sek Won and Altschuler, Josiah and Kohane, Isaac S and Park, Peter J},
  date = {2005-09-20},
  doi = {10/cg8w3z},
  eprint = {16174746},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Tian et al. - 2005 - Discovering statistically significant pathways in .pdf},
  issn = {0027-8424},
  journaltitle = {Proceedings of the National Academy of Sciences of the United States of America},
  keywords = {Algorithms,Alzheimer Disease,Alzheimer Disease: genetics,Alzheimer Disease: metabolism,Animals,Autoimmunity,Autoimmunity: genetics,Autoimmunity: physiology,Databases; Genetic,Dermatomyositis,Dermatomyositis: genetics,Dermatomyositis: metabolism,Gene Expression Profiling,Gene Expression Profiling: methods,Gene Expression Regulation,Gene Expression Regulation: physiology,Humans,Interferon-alpha,Interferon-alpha: genetics,Interferon-alpha: metabolism,Interferon-beta,Interferon-beta: genetics,Interferon-beta: metabolism,Models; Genetic,Myositis,Myositis; Inclusion Body,Myositis; Inclusion Body: genetics,Myositis; Inclusion Body: metabolism,Myositis: genetics,Myositis: metabolism,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods,Predictive Value of Tests,T-Lymphocytes,T-Lymphocytes: metabolism,Transcription Factors,Transcription Factors: metabolism},
  number = {38},
  pages = {13544-9},
  title = {Discovering Statistically Significant Pathways in Expression Profiling Studies},
  volume = {102}
}

@article{Tibshirani2002,
  abstract = {We have devised an approach to cancer class prediction from gene expression profiling, based on an enhancement of the simple nearest prototype (centroid) classifier. We shrink the prototypes and hence obtain a classifier that is often more accurate than competing methods. Our method of "nearest shrunken centroids" identifies subsets of genes that best characterize each class. The technique is general and can be used in many other classification problems. To demonstrate its effectiveness, we show that the method was highly efficient in finding genes for classifying small round blue cell tumors and leukemias.},
  author = {Tibshirani, Robert and Hastie, Trevor and Narasimhan, Balasubramanian and Chu, Gilbert},
  date = {2002-05-14},
  doi = {10/d2h5n3},
  eprint = {12011421},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Tibshirani et al. - 2002 - Diagnosis of multiple cancer types by shrunken cen.pdf},
  issn = {0027-8424},
  journaltitle = {Proceedings of the National Academy of Sciences of the United States of America},
  keywords = {Child,Discriminant Analysis,DNA; Neoplasm,DNA; Neoplasm: analysis,Gene Expression,Gene Expression Profiling,Humans,Neoplasms,Neoplasms: classification,Neoplasms: diagnosis,Neoplasms: genetics,Precursor Cell Lymphoblastic Leukemia-Lymphoma,Precursor Cell Lymphoblastic Leukemia-Lymphoma: cl,Precursor Cell Lymphoblastic Leukemia-Lymphoma: di,Precursor Cell Lymphoblastic Leukemia-Lymphoma: ge,Probability},
  number = {10},
  pages = {6567-72},
  title = {Diagnosis of Multiple Cancer Types by Shrunken Centroids of Gene Expression},
  volume = {99}
}

@article{Timko2008,
  abstract = {Background: Cowpea, Vigna unguiculata (L.) Walp., is one of the most important food and forage legumes in the semi-arid tropics because of its drought tolerance and ability to grow on poor quality soils. Approximately 80\% of cowpea production takes place in the dry savannahs of tropical West and Central Africa, mostly by poor subsistence farmers. Despite its economic and social importance in the developing world, cowpea remains to a large extent an underexploited crop. Among the major goals of cowpea breeding and improvement programs is the stacking of desirable agronomic traits, such as disease and pest resistance and response to abiotic stresses. Implementation of marker-assisted selection and breeding programs is severely limited by a paucity of trait-linked markers and a general lack of information on gene structure and organization. With a nuclear genome size estimated at \textasciitilde{}620 Mb, the cowpea genome is an ideal target for reduced representation sequencing. Results: We report here the sequencing and analysis of the gene-rich, hypomethylated portion of the cowpea genome selectively cloned by methylation filtration (MF) technology. Over 250,000 gene-space sequence reads (GSRs) with an average length of 610 bp were generated, yielding \textasciitilde{}160 Mb of sequence information. The GSRs were assembled, annotated by BLAST homology searches of four public protein annotation databases and four plant proteomes (A. thaliana, M. truncatula, O. sativa, and P. trichocarpa), and analyzed using various domain and gene modeling tools. A total of 41,260 GSR assemblies and singletons were annotated, of which 19,786 have unique GenBank accession numbers. Within the GSR dataset, 29\% of the sequences were annotated using the Arabidopsis Gene Ontology (GO) with the largest categories of assigned function being catalytic activity and metabolic processes, groups that include the majority of cellular enzymes and components of amino acid, carbohydrate and lipid metabolism. 
A total of 5,888 GSRs had homology to genes encoding transcription factors (TFs) and transcription associated factors (TAFs) representing about 5\% of the total annotated sequences in the dataset. Sixty-two (62) of the 64 well-characterized plant transcription factor (TF) gene families are represented in the cowpea GSRs, and these families are of similar size and phylogenetic organization to those characterized in other plants. The cowpea GSRs also provides a rich source of genes involved in photoperiodic control, symbiosis, and defense-related responses. Comparisons to available databases revealed that about 74\% of cowpea ESTs and 70\% of all legume ESTs were represented in the GSR dataset. As approximately 12\% of all GSRs contain an identifiable simple-sequence repeat, the dataset is a powerful resource for the design of microsatellite markers. Conclusion: The availability of extensive publicly available genomic data for cowpea, a non-model legume with significant importance in the developing world, represents a significant step forward in legume research. Not only does the gene space sequence enable the detailed analysis of gene structure, gene family organization and phylogenetic relationships within cowpea, but it also facilitates the characterization of syntenic relationships with other cultivated and model legumes, and will contribute to determining patterns of chromosomal evolution in the Leguminosae. The micro and macrosyntenic relationships detected between cowpea and other cultivated and model legumes should simplify the identification of informative markers for marker-assisted trait selection and map-based gene isolation necessary for cowpea improvement. \textcopyright{} 2008 Timko et al; licensee BioMed Central Ltd.},
  author = {Timko, Michael P. and Rushton, Paul J. and Laudeman, Thomas W. and Bokowiec, Marta T. and Chipumuro, Edmond and Cheung, Foo and Town, Christopher D. and Chen, Xianfeng},
  date = {2008},
  doi = {10/cmfwx9},
  file = {/Users/ryan/Documents/Zotero Library/Timko et al. - 2008 - Sequencing and analysis of the gene-rich space of .pdf},
  issn = {1471-2164},
  journaltitle = {BMC Genomics},
  number = {1},
  pages = {103},
  title = {Sequencing and Analysis of the Gene-Rich Space of Cowpea},
  volume = {9}
}

@article{Togel2010,
  abstract = {Acute kidney injury (AKI) is a common clinical complication, associated with poor outcomes and the development of chronic kidney disease. Despite major advances in the understanding of its pathophysiology, available therapies for AKI are only supportive; therefore, adequate functional recovery from AKI must predominantly rely on the kidney's own reparative ability. An extensive body of preclinical data from our own and from other laboratories has shown that administration of adult multipotent marrow stromal cells (commonly referred to as mesenchymal stem cells [MSCs]), effectively ameliorates experimental AKI by exerting paracrine renoprotective effects and by stimulating tissue repair. Based on these findings, a clinical trial has been conducted to investigate the safety and efficacy of MSCs administered to open-heart surgery patients who are at high risk of postoperative AKI. In this Perspectives article, we discuss some of the early data from this trial and describe potential applications for stem cell therapies in other fields of nephrology.},
  author = {T\"ogel, Florian E and Westenfelder, Christof},
  date = {2010-03},
  doi = {10/fnbpr3},
  eprint = {20186233},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Tögel and Westenfelder - 2010 - Mesenchymal stem cells a new therapeutic tool for.pdf},
  issn = {1759-507X},
  journaltitle = {Nature reviews. Nephrology},
  keywords = {Acute Kidney Injury,Acute Kidney Injury: etiology,Acute Kidney Injury: mortality,Acute Kidney Injury: therapy,acute-kidney-injury,Animals,Clinical Trials; Phase I as Topic,Coronary Artery Bypass,Coronary Artery Bypass: adverse effects,Coronary Artery Bypass: methods,Coronary Disease,Coronary Disease: diagnosis,Coronary Disease: therapy,cyno-project,Disease Models; Animal,Female,Follow-Up Studies,Graft Rejection,Graft Survival,Humans,Kidney Failure; Chronic,Kidney Failure; Chronic: etiology,Kidney Failure; Chronic: physiopathology,Kidney Failure; Chronic: prevention \& control,Male,Mesenchymal Stem Cell Transplantation,Mesenchymal Stem Cell Transplantation: adverse eff,Mesenchymal Stem Cell Transplantation: methods,Mice,Rats,Risk Assessment,Survival Rate,Transplantation; Autologous,Treatment Outcome},
  number = {3},
  pages = {179-83},
  title = {Mesenchymal Stem Cells: A New Therapeutic Tool for {{AKI}}},
  volume = {6}
}

@article{Trapnell2010,
  abstract = {High-throughput mRNA sequencing (RNA-Seq) promises simultaneous transcript discovery and abundance estimation. However, this would require algorithms that are not restricted by prior gene annotations and that account for alternative transcription and splicing. Here we introduce such algorithms in an open-source software program called Cufflinks. To test Cufflinks, we sequenced and analyzed {$>$}430 million paired 75-bp RNA-Seq reads from a mouse myoblast cell line over a differentiation time series. We detected 13,692 known transcripts and 3,724 previously unannotated ones, 62\% of which are supported by independent expression data or by homologous genes in other species. Over the time series, 330 genes showed complete switches in the dominant transcription start site (TSS) or splice isoform, and we observed more subtle shifts in 1,304 other genes. These results suggest that Cufflinks can illuminate the substantial regulatory flexibility and complexity in even this well-studied model of muscle development and that it can improve transcriptome-based genome annotation.},
  author = {Trapnell, Cole and Williams, Brian A and Pertea, Geo and Mortazavi, Ali and Kwan, Gordon and van Baren, Marijke J and Salzberg, Steven L and Wold, Barbara J and Pachter, Lior},
  date = {2010-05-02},
  doi = {10/dhrc2j},
  eprint = {20436464},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Trapnell et al. - 2010 - Transcript assembly and quantification by RNA-Seq .pdf;/Users/ryan/Documents/Zotero Library/Trapnell et al. - 2010 - Transcript assembly and quantification by RNA-Seq 2.pdf},
  issn = {1087-0156},
  journaltitle = {Nature Biotechnology},
  keywords = {Algorithms,Animals,Cell Differentiation,Cell Differentiation: genetics,Cell Line,Gene Expression Profiling,Gene Expression Profiling: methods,Genome,Messenger,Messenger: analysis,Messenger: genetics,Messenger: metabolism,Mice,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods,Protein Isoforms,Protein Isoforms: genetics,Protein Isoforms: metabolism,Proto-Oncogene Proteins c-myc,Proto-Oncogene Proteins c-myc: genetics,Proto-Oncogene Proteins c-myc: metabolism,RNA,RNA: methods,Sequence Analysis,Software},
  number = {5},
  options = {useprefix=true},
  pages = {511-515},
  title = {Transcript Assembly and Quantification by {{RNA}}-{{Seq}} Reveals Unannotated Transcripts and Isoform Switching during Cell Differentiation},
  volume = {28}
}

@article{Trapnell2012,
  author = {Trapnell, Cole and Hendrickson, David G and Sauvageau, Martin and Goff, Loyal and Rinn, John L and Pachter, Lior},
  date = {2012-12-09},
  doi = {10/gfghc4},
  file = {/Users/ryan/Documents/Zotero Library/Trapnell et al. - 2012 - Differential analysis of gene regulation at transc.pdf;/Users/ryan/Documents/Zotero Library/Trapnell et al. - 2012 - Differential analysis of gene regulation at transc2.pdf},
  issn = {1087-0156},
  journaltitle = {Nature Biotechnology},
  number = {1},
  pages = {46-53},
  title = {Differential Analysis of Gene Regulation at Transcript Resolution with {{RNA}}-Seq},
  volume = {31}
}

@article{Trapnell2012a,
  author       = {Trapnell, Cole and Roberts, Adam and Goff, Loyal and Pertea, Geo and Kim, Daehwan and Kelley, David R and Pimentel, Harold and Salzberg, Steven L and Rinn, John L and Pachter, Lior},
  title        = {Differential Gene and Transcript Expression Analysis of {{RNA}}-Seq Experiments with {{TopHat}} and {{Cufflinks}}},
  journaltitle = {Nature Protocols},
  volume       = {7},
  number       = {3},
  pages        = {562-578},
  date         = {2012-03-01},
  issn         = {1754-2189},
  doi          = {10/f4pbzd},
  file         = {/Users/ryan/Documents/Zotero Library/Trapnell et al. - 2012 - Differential gene and transcript expression analys.pdf},
}

@article{Triff2017,
  abstract = {During colon cancer, epigenetic alterations contribute to the dysregulation of major cellular functions and signaling pathways. Modifications in chromatin signatures such as H3K4me3 and H3K9ac, which are associated with transcriptionally active genes, can lead to genomic instability and perturb the expression of gene sets associated with oncogenic processes. In order to further elucidate early pre-tumorigenic epigenetic molecular events driving CRC, we integrated diverse, genome-wide, epigenetic inputs (by high throughput sequencing of RNA, H3K4me3, and H3K9ac) and compared differentially expressed transcripts (DE) and enriched regions (DER) in an in-vivo rat colon cancer progression model. Carcinogen (AOM) effects were detected genome-wide at the RNA (116 DE genes), K9ac (49 DERs including 24 genes) and K4me3 (7678 DERs including 3792 genes) level. RNA-seq differential expression and pathway analysis indicated that interferon-associated innate immune responses were impacted by AOM exposure. Despite extensive associations between K4me3 DERs and colon tumorigenesis (1210 genes were linked to colorectal carcinoma) including FOXO3, GNAI2, H2AFX, MSH2, NR3C1, PDCD4 and VEGFA, these changes were not reflected at the RNA gene expression level during early cancer progression. Collectively, our results indicate that carcinogen-induced changes in gene K4me3 DERs are harbingers of future transcriptional events, which drive malignant transformation of the colon.},
  author = {Triff, Karen and McLean, Mathew W. and Konganti, Kranti and Pang, Jiahui and Callaway, Evelyn and Zhou, Beiyan and Ivanov, Ivan and Chapkin, Robert S.},
  date = {2017},
  doi = {10/gbm8nx},
  file = {/Users/ryan/Documents/Zotero Library/Triff et al. - 2017 - Assessment of histone tail modifications and trans.pdf},
  issn = {1879-260X},
  journaltitle = {Biochimica et Biophysica Acta - Molecular Basis of Disease},
  number = {6},
  pages = {1392-1402},
  title = {Assessment of Histone Tail Modifications and Transcriptional Profiling during Colon Cancer Progression Reveals a Global Decrease in {{H3K4me3}} Activity},
  volume = {1863}
}

@article{Troyanskaya2001,
  abstract = {Motivation: Gene expression microarray experiments can generate data sets with multiple missing expression values. Unfortunately, many algorithms},
  author = {Troyanskaya, Olga and Cantor, Michael and Sherlock, Gavin and Brown, Pat and Hastie, Trevor and Tibshirani, Robert and Botstein, David and Altman, Russ B.},
  date = {2001-06-01},
  doi = {10/bz5w38},
  file = {/Users/ryan/Documents/Zotero Library/Troyanskaya et al. - 2001 - Missing value estimation methods for DNA microarra.pdf;/Users/ryan/Zotero/storage/XK9DLAP7/272365.html},
  issn = {1367-4803},
  journaltitle = {Bioinformatics},
  keywords = {*Algorithms,*Data Interpretation; Statistical,*Mathematical Computing,*Oligonucleotide Array Sequence Analysis statistic,Cell Cycle genetics,Cluster Analysis,Comparative Study,Data Display,Gene Expression,Multigene Family,Saccharomyces cerevisiae genetics,Sensitivity and Specificity,Software,Support; Non U.S. Gov't,Support; U.S. Gov't; Non P.H.S.,Support; U.S. Gov't; P.H.S.},
  langid = {english},
  number = {6},
  pages = {520-525},
  shortjournal = {Bioinformatics},
  title = {Missing Value Estimation Methods for {{DNA}} Microarrays},
  volume = {17}
}

@article{Tsai2010,
  abstract = {Long intergenic noncoding RNAs (lincRNAs) regulate chromatin states and epigenetic inheritance. Here, we show that the lincRNA HOTAIR serves as a scaffold for at least two distinct histone modification complexes. A 5' domain of HOTAIR binds polycomb repressive complex 2 (PRC2), whereas a 3' domain of HOTAIR binds the LSD1/CoREST/REST complex. The ability to tether two distinct complexes enables RNA-mediated assembly of PRC2 and LSD1 and coordinates targeting of PRC2 and LSD1 to chromatin for coupled histone H3 lysine 27 methylation and lysine 4 demethylation. Our results suggest that lincRNAs may serve as scaffolds by providing binding surfaces to assemble select histone modification enzymes, thereby specifying the pattern of histone modifications on target genes.},
  author = {Tsai, Miao-Chih and Manor, Ohad and Wan, Yue and Mosammaparast, Nima and Wang, Jordon K and Lan, Fei and Shi, Yang and Segal, Eran and Chang, Howard Y},
  date = {2010-08-06},
  doi = {10/cqskdj},
  eprint = {20616235},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Tsai et al. - 2010 - Long Noncoding RNA as Modular Scaffold of Histone .pdf},
  issn = {0036-8075},
  journaltitle = {Science},
  number = {5992},
  pages = {689-693},
  title = {Long Noncoding {{RNA}} as Modular Scaffold of Histone Modification Complexes},
  volume = {329}
}

@article{Tsutakawa2011,
  abstract = {Flap endonuclease (FEN1), essential for DNA replication and repair, removes RNA and DNA 5' flaps. FEN1 5' nuclease superfamily members acting in nucleotide excision repair (XPG), mismatch repair (EXO1), and homologous recombination (GEN1) paradoxically incise structurally distinct bubbles, ends, or Holliday junctions, respectively. Here, structural and functional analyses of human FEN1:DNA complexes show structure-specific, sequence-independent recognition for nicked dsDNA bent 100\textdegree{} with unpaired 3' and 5' flaps. Above the active site, a helical cap over a gateway formed by two helices enforces ssDNA threading and specificity for free 5' ends. Crystallographic analyses of product and substrate complexes reveal that dsDNA binding and bending, the ssDNA gateway, and double-base unpairing flanking the scissile phosphate control precise flap incision by the two-metal-ion active site. Superfamily conserved motifs bind and open dsDNA; direct the target region into the helical gateway, permitting only nonbase-paired oligonucleotides active site access; and support a unified understanding of superfamily substrate specificity.},
  author = {Tsutakawa, Susan E and Classen, Scott and Chapados, Brian R and Arvai, Andrew S and Finger, L David and Guenther, Grant and Tomlinson, Christopher G and Thompson, Peter and Sarker, Altaf H and Shen, Binghui and Cooper, Priscilla K and Grasby, Jane A and Tainer, John A},
  date = {2011-04-15},
  doi = {10/bpxc4g},
  eprint = {21496641},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Tsutakawa et al. - 2011 - Human flap endonuclease structures, DNA double-bas.pdf},
  issn = {1097-4172},
  journaltitle = {Cell},
  keywords = {Amino Acid Sequence,Catalytic Domain,DNA,DNA Mutational Analysis,DNA: metabolism,Exodeoxyribonucleases,Exodeoxyribonucleases: chemistry,Exodeoxyribonucleases: metabolism,Flap Endonucleases,Flap Endonucleases: chemistry,Flap Endonucleases: metabolism,Humans,Models; Molecular,Molecular Sequence Data,Sequence Alignment,Substrate Specificity},
  number = {2},
  pages = {198-211},
  title = {Human Flap Endonuclease Structures, {{DNA}} Double-Base Flipping, and a Unified Understanding of the {{FEN1}} Superfamily},
  volume = {145}
}

@article{Tuddenham2012,
  abstract = {Roseolovirus, or human herpesvirus 6 (HHV-6), is a ubiquitous human pathogen infecting over 95\% of the population by the age of 2 years. As with other herpesviruses, reactivation of HHV-6 can present with severe complications in immunocompromised individuals. Recent studies have highlighted the importance of herpesvirus-derived microRNAs (miRNAs) in modulating both cellular and viral gene expression. An initial report which computed the likelihood of various viruses to encode miRNAs did not predict HHV-6 miRNAs. To experimentally screen for small HHV-6-encoded RNAs, we conducted large-scale sequencing of Sup-T-1 cells lytically infected with a laboratory strain of HHV-6B. This revealed an abundant, 60- to 65-nucleotide RNA of unknown function derived from the lytic origin of replication (OriLyt) that gave rise to smaller RNA species of 18 or 19 nucleotides. In addition, we identified four pre-miRNAs whose mature forms accumulated in Argonaute 2. In contrast to the case for other betaherpesviruses, HHV-6B miRNAs are expressed from direct repeat regions (DR(L) and DR(R)) located at either side of the genome. All miRNAs are conserved in the closely related HHV-6A variant, and one of them is a seed ortholog of the human miRNA miR-582-5p. Similar to alphaherpesvirus miRNAs, they are expressed in antisense orientation relative to immediate-early open reading frames (ORFs) and thus have the potential to regulate key viral genes.},
  author = {Tuddenham, Lee and Jung, Jette S and Chane-Woon-Ming, B\'eatrice and D\"olken, Lars and Pfeffer, S\'ebastien},
  date = {2012-02},
  doi = {10/bndqcq},
  eprint = {22114334},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Tuddenham et al. - 2012 - Small RNA deep sequencing identifies microRNAs and.pdf},
  issn = {1098-5514},
  journaltitle = {Journal of Virology},
  keywords = {Base Sequence,Cell Line,DNA Primers,Fluorescent Antibody Technique,Genes; Viral,Herpesvirus 6; Human,Herpesvirus 6; Human: genetics,Humans,Likelihood Functions,MicroRNAs,MicroRNAs: genetics,Replication Origin,Reverse Transcriptase Polymerase Chain Reaction,RNA; Untranslated,RNA; Untranslated: genetics},
  number = {3},
  pages = {1638-1649},
  title = {Small {{RNA}} Deep Sequencing Identifies {{microRNAs}} and Other Small Noncoding {{RNAs}} from Human Herpesvirus {{6B}}},
  volume = {86}
}

@article{Turck2011,
  abstract = {Purpose: Accurate early anticipation of long-term irreversible brain damage during the acute phase of patients with aneurysmal subarachnoid hemorrhage (aSAH) remains difficult. Using a combination of clinical scores together with brain injury-related biomarkers (H-FABP, NDKA, UFD1 and S100{$\beta$}), this study aimed at developing a multiparameter prognostic panel to facilitate early outcome prediction following aSAH. Methods: Blood samples of 141 aSAH patients from two separated cohorts (sets of 28 and 113 patients) were prospectively enrolled and analyzed with 14 months of delay. Patients were admitted within 48 h following aSAH onset. A venous blood sample was withdrawn within 12 h after admission. H-FABP, NDKA, UFD1, S100{$\beta$} and troponin I levels were determined using classical immunoassays. The World Federation of Neurological Surgeons (WFNS) at admission and the Glasgow Outcome Score (GOS) at 6 months were evaluated. Results: In the two cohorts, blood concentration of H-FABP, S100{$\beta$} and troponin I at admission significantly predicted unfavorable outcome (GOS 1-2-3). A multivariate analysis identified a six-parameter panel, including WFNS, H-FABP, S100{$\beta$}, troponin I, NDKA and UFD-1; when at least three of these parameters were simultaneously above cutoff values, prediction of unfavorable outcome reached around 70\% sensitivity in both cohorts for 100\% specificity. Conclusion: The use of this panel, including four brain injury-related proteins, one cardiac marker and a clinical score, could be a valuable tool to identify aSAH patients at risk of poor outcome.},
  author = {Turck, Natacha and Vutskits, Laszlo and Sanchez-Pena, Paola and Robin, Xavier and Hainard, Alexandre and Gex-Fabry, Marianne and Fouda, Catherine and Bassem, Hadiji and Mueller, Markus and Lisacek, Fr\'ed\'erique and Puybasset, Louis and Sanchez, Jean Charles},
  date = {2010},
  doi = {10/d8hzhf},
  file = {/Users/ryan/Documents/Zotero Library/Turck et al. - 2010 - A multiparameter panel method for outcome predicti.pdf},
  issn = {0342-4642},
  journaltitle = {Intensive Care Medicine},
  keywords = {Aneurysmal subarachnoid hemorrhage,H-FABP,NDKA,Prognosis,S100β},
  number = {1},
  pages = {107-115},
  title = {A Multiparameter Panel Method for Outcome Prediction Following Aneurysmal Subarachnoid Hemorrhage},
  volume = {36}
}

@article{Turner2017,
  author = {Turner, Isaac and Garimella, Kiran V and Iqbal, Zamin and McVean, Gil},
  date = {2017},
  doi = {10/ggcxmv},
  file = {/Users/ryan/Documents/Zotero Library/Turner and Garimella - 2017 - Integrating long-range connectivity information in.pdf},
  journaltitle = {bioRxiv},
  pubstate = {prepublished},
  title = {Integrating Long-Range Connectivity Information into de {{Bruijn}} Graphs}
}

@article{Turro2013,
  author       = {Turro, Ernest and Astle, William J and Tavar\'e, Simon},
  title        = {Flexible Analysis of {{RNA}}-Seq Data Using Mixed Effects Models},
  journaltitle = {Bioinformatics},
  date         = {2014-01-15},
  volume       = {30},
  number       = {2},
  pages        = {180-188},
  doi          = {10/f5qw93},
  eprint       = {24281695},
  eprinttype   = {pmid},
  issn         = {1460-2059},
  abstract     = {MOTIVATION: Most methods for estimating differential expression from RNA-seq are based on statistics that compare normalized read counts between treatment classes. Unfortunately, reads are in general too short to be mapped unambiguously to features of interest, such as genes, isoforms or haplotype-specific isoforms. There are methods for estimating expression levels that account for this source of ambiguity. However, the uncertainty is not generally accounted for in downstream analysis of gene expression experiments. Moreover, at the individual transcript level, it can sometimes be too large to allow useful comparisons between treatment groups. RESULTS: In this article we make two proposals that improve the power, specificity and versatility of expression analysis using RNA-seq data. First, we present a Bayesian method for model selection that accounts for read mapping ambiguities using random effects. This polytomous model selection approach can be used to identify many interesting patterns of gene expression and is not confined to detecting differential expression between two groups. For illustration, we use our method to detect imprinting, different types of regulatory divergence in cis and in trans and differential isoform usage, but many other applications are possible. Second, we present a novel collapsing algorithm for grouping transcripts into inferential units that exploits the posterior correlation between transcript expression levels. The aggregate expression levels of these units can be estimated with useful levels of uncertainty. Our algorithm can improve the precision of expression estimates when uncertainty is large with only a small reduction in biological resolution.},
  file         = {/Users/ryan/Documents/Zotero Library/Turro et al. - 2014 - Flexible analysis of RNA-seq data using mixed effe.pdf;/Users/ryan/Documents/Zotero Library/Turro et al. - 2014 - Flexible analysis of RNA-seq data using mixed effe2.pdf},
}

@article{Ule2006,
  abstract = {Nova proteins are a neuron-specific alternative splicing factors. We have combined bioinformatics, biochemistry and genetics to derive an RNA map describing the rules by which Nova proteins regulate alternative splicing. This map revealed that the position of Nova binding sites (YCAY clusters) in a pre-messenger RNA determines the outcome of splicing. The map correctly predicted Nova's effect to inhibit or enhance exon inclusion, which led us to examine the relationship between the map and Nova's mechanism of action. Nova binding to an exonic YCAY cluster changed the protein complexes assembled on pre-mRNA, blocking U1 snRNP (small nuclear ribonucleoprotein) binding and exon inclusion, whereas Nova binding to an intronic YCAY cluster enhanced spliceosome assembly and exon inclusion. Assays of splicing intermediates of Nova-regulated transcripts in mouse brain revealed that Nova preferentially regulates removal of introns harbouring (or closest to) YCAY clusters. These results define a genome-wide map relating the position of a cis-acting element to its regulation by an RNA binding protein, namely that Nova binding to YCAY clusters results in a local and asymmetric action to regulate spliceosome assembly and alternative splicing in neurons.},
  author = {Ule, Jernej and Stefani, Giovanni and Mele, Aldo and Ruggiu, Matteo and Wang, Xuning and Taneri, Bahar and Gaasterland, Terry and Blencowe, Benjamin J and Darnell, Robert B},
  date = {2006-11-30},
  doi = {10/b33twt},
  eprint = {17065982},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Ule et al. - 2006 - An RNA map predicting Nova-dependent splicing regu.pdf},
  issn = {1476-4687},
  journaltitle = {Nature},
  keywords = {Alternative Splicing,Alternative Splicing: physiology,Animals,Antigens,GABA-A,GABA-A: genetics,Humans,Introns,Mice,Neoplasm,Neoplasm: physiology,Nerve Tissue Proteins,Nerve Tissue Proteins: physiology,Nucleic Acid Conformation,Protein Binding,Receptors,Ribonucleoprotein,RNA,RNA Precursors,RNA Precursors: chemistry,RNA Precursors: metabolism,RNA-Binding Proteins,RNA-Binding Proteins: physiology,RNA: chemistry,RNA: physiology,U1 Small Nuclear,U1 Small Nuclear: antagonists &,U1 Small Nuclear: metabolism},
  number = {7119},
  pages = {580-586},
  title = {An {{RNA}} Map Predicting {{Nova}}-Dependent Splicing Regulation},
  volume = {444}
}

@article{Urbut2016,
  author = {Urbut, Sarah Margaret and Wang, Gao and Stephens, Matthew},
  date = {2016},
  doi = {10/ggcxmw},
  file = {/Users/ryan/Documents/Zotero Library/Urbut et al. - 2016 - Flexible statistical methods for estimating and te.pdf},
  journaltitle = {bioRxiv},
  pubstate = {prepublished},
  title = {Flexible Statistical Methods for Estimating and Testing Effects in Genomic Studies with Multiple Conditions}
}

@article{Valenzuela2017,
  author       = {Valenzuela, Nicole M. and Reed, Elaine F.},
  title        = {Antibody-Mediated Rejection across Solid Organ Transplants: Manifestations, Mechanisms, and Therapies},
  journaltitle = {Journal of Clinical Investigation},
  date         = {2017-06-12},
  volume       = {127},
  number       = {7},
  pages        = {2492-2504},
  doi          = {10/gbmrzf},
  issn         = {0021-9738},
  abstract     = {Solid organ transplantation is a curative therapy for hundreds of thousands of patients with end-stage organ failure. However, long-term outcomes have not improved, and nearly half of transplant recipients will lose their allografts by 10 years after transplant. One of the major challenges facing clinical transplantation is antibody-mediated rejection (AMR) caused by anti-donor HLA antibodies. AMR is highly associated with graft loss, but unfortunately there are few efficacious therapies to prevent and reverse AMR. This Review describes the clinical and histological manifestations of AMR, and discusses the immunopathological mechanisms contributing to antibody-mediated allograft injury as well as current and emerging therapies.},
  file         = {/Users/ryan/Documents/Zotero Library/Valenzuela and Reed - 2017 - Antibody-mediated rejection across solid organ tra.pdf},
}

@article{VanDeWiel2013,
  abstract = {Next generation sequencing is quickly replacing microarrays as a technique to probe different molecular levels of the cell, such as DNA or RNA. The technology provides higher resolution, while reducing bias. RNA sequencing results in counts of RNA strands. This type of data imposes new statistical challenges. We present a novel, generic approach to model and analyze such data. Our approach aims at large flexibility of the likelihood (count) model and the regression model alike. Hence, a variety of count models is supported, such as the popular NB model, which accounts for overdispersion. In addition, complex, non-balanced designs and random effects are accommodated. Like some other methods, our method provides shrinkage of dispersion-related parameters. However, we extend it by enabling joint shrinkage of parameters, including those for which inference is desired. We argue that this is essential for Bayesian multiplicity correction. Shrinkage is effectuated by empirically estimating priors. We discuss several parametric (mixture) and non-parametric priors and develop procedures to estimate (parameters of) those. Inference is provided by means of local and Bayesian false discovery rates. We illustrate our method on several simulations and two data sets, also to compare it with other methods. Model- and data-based simulations show substantial improvements in the sensitivity at the given specificity. The data motivate the use of the ZI-NB as a powerful alternative to the NB, which results in higher detection rates for low-count data. Finally, compared with other methods, the results on small sample subsets are more reproducible when validated on their large sample complements, illustrating the importance of the type of shrinkage.},
  author = {van de Wiel, Mark A and Leday, Gwena\"el G R and Pardo, Luba and Rue, H\aa{}vard and van der Vaart, Aad W and van Wieringen, Wessel N},
  date = {2013-01-01},
  doi = {10/ggcxmx},
  eprint = {22988280},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Van De Wiel et al. - 2013 - Bayesian analysis of RNA sequencing data by estima.pdf},
  issn = {1465-4644},
  journaltitle = {Biostatistics},
  keywords = {★,Base Sequence,Bayes Theorem,Computer Simulation,Data Interpretation,Models,Molecular Sequence Data,RNA,RNA: chemistry,RNA: genetics,RNA: methods,Sequence Analysis,Statistical},
  number = {1},
  options = {useprefix=true},
  pages = {113-128},
  title = {Bayesian Analysis of {{RNA}} Sequencing Data by Estimating Multiple Shrinkage Priors},
  volume = {14}
}

@article{VandeWiel2016,
  abstract = {For many high-dimensional studies, additional information on the variables, like (genomic) annotation or external p-values, is available. In the context of binary and continuous prediction, we develop a method for adaptive group-regularized (logistic) ridge regression, which makes structural use of such 'co-data'. Here, 'groups' refer to a partition of the variables according to the co-data. We derive empirical Bayes estimates of group-specific penalties, which possess several nice properties: (i) They are analytical. (ii) They adapt to the informativeness of the co-data for the data at hand. (iii) Only one global penalty parameter requires tuning by cross-validation. In addition, the method allows use of multiple types of co-data at little extra computational effort. We show that the group-specific penalties may lead to a larger distinction between 'near-zero' and relatively large regression parameters, which facilitates post hoc variable selection. The method, termed GRridge, is implemented in an easy-to-use R-package. It is demonstrated on two cancer genomics studies, which both concern the discrimination of precancerous cervical lesions from normal cervix tissues using methylation microarray data. For both examples, GRridge clearly improves the predictive performances of ordinary logistic ridge regression and the group lasso. In addition, we show that for the second study, the relatively good predictive performance is maintained when selecting only 42 variables.},
  author = {van de Wiel, Mark A. and Lien, Tonje G. and Verlaat, Wina and van Wieringen, Wessel N. and Wilting, Saskia M.},
  date = {2016},
  doi = {10/gdwtm2},
  eprint = {26365903},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/van de Wiel et al. - 2016 - Better prediction by use of co-data Adaptive grou.pdf},
  issn = {1097-0258},
  journaltitle = {Statistics in Medicine},
  keywords = {Classification,Empirical Bayes,Logistic ridge regression,Methylation,Random forest,Variable selection},
  number = {3},
  options = {useprefix=true},
  pages = {368-381},
  title = {Better Prediction by Use of Co-Data: {{Adaptive}} Group-Regularized Ridge Regression},
  volume = {35}
}

@article{VanGurp2013,
  abstract = {Priming of random hexamers in cDNA synthesis is known to show sequence bias, but in addition it has been suggested recently that mismatches in random hexamer priming could be a cause of mismatches between the original RNA fragment and observed sequence reads. To explore random hexamer mispriming as a potential source of these errors, we analyzed two independently generated RNA-seq datasets of synthetic ERCC spikes for which the reference is known. First strand cDNA synthesized by random hexamer priming on RNA showed consistent position and nucleotide-specific mismatch errors in the first seven nucleotides. The mismatch errors found in both datasets are consistent in distribution and thermodynamically stable mismatches are more common. This strongly indicates that RNA-DNA mispriming of specific random hexamers causes these errors. Due to their consistency and specificity, mispriming errors can have profound implications for downstream applications if not dealt with properly.},
  author = {van Gurp, Thomas P and McIntyre, Lauren M and Verhoeven, Koen J F},
  date = {2013-12},
  doi = {10/ggcxmz},
  eprint = {24386481},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/van Gurp et al. - 2013 - Consistent errors in first strand cDNA due to rand.pdf},
  issn = {1932-6203},
  journaltitle = {PLoS ONE},
  number = {12},
  options = {useprefix=true},
  pages = {e85583},
  title = {Consistent Errors in First Strand {{cDNA}} Due to Random Hexamer Mispriming},
  volume = {8}
}

@article{VanHouwelingen2014,
  abstract = {This paper reviews and discusses the role of Empirical Bayes methodology in medical statistics in the last 50 years. It gives some background on the origin of the empirical Bayes approach and its link with the famous Stein estimator. The paper describes the application in four important areas in medical statistics: disease mapping, health care monitoring, meta-analysis, and multiple testing. It ends with a warning that the application of the outcome of an empirical Bayes analysis to the individual "subjects" is a delicate matter that should be handled with prudence and care.},
  author = {van Houwelingen, Hans C.},
  date = {2014-11},
  doi = {10/f2t6s9},
  eprint = {25205521},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/van Houwelingen - 2014 - The role of empirical Bayes methodology as a leadi.pdf},
  isbn = {3130220062},
  issn = {0323-3847},
  journaltitle = {Biometrical Journal},
  keywords = {Disease mapping,Empirical Bayes,Health care monitoring,Meta-analysis,Multiple testing},
  number = {6},
  options = {useprefix=true},
  pages = {919-932},
  title = {The Role of Empirical {{Bayes}} Methodology as a Leading Principle in Modern Medical Statistics},
  volume = {56}
}

@report{VanIterson2016,
  author = {van Iterson, Maarten M. and van Zwet, Erik W. and Slagboom, P. Eline and Heijmans, Bastiaan T.},
  date = {2016-05-27},
  doi = {10/dd8w},
  file = {/Users/ryan/Documents/Zotero Library/van Iterson et al. - 2016 - Controlling bias and inflation in epigenome- and t.pdf},
  institution = {bioRxiv},
  options = {useprefix=true},
  title = {Controlling Bias and Inflation in Epigenome- and Transcriptome-Wide Association Studies Using the Empirical Null Distribution},
  type = {Preprint},
  url = {http://biorxiv.org/lookup/doi/10.1101/055772}
}

@article{VanNieuwerburgh2012,
  abstract = {Standard Illumina mate-paired libraries are constructed from 3- to 5-kb DNA fragments by a blunt-end circularization. Sequencing reads that pass through the junction of the two joined ends of a 3-5-kb DNA fragment are not easy to identify and pose problems during mapping and de novo assembly. Longer read lengths increase the possibility that a read will cross the junction. To solve this problem, we developed a mate-paired protocol for use with Illumina sequencing technology that uses Cre-Lox recombination instead of blunt end circularization. In this method, a LoxP sequence is incorporated at the junction site. This sequence allows screening reads for junctions without using a reference genome. Junction reads can be trimmed or split at the junction. Moreover, the location of the LoxP sequence in the reads distinguishes mate-paired reads from spurious paired-end reads. We tested this new method by preparing and sequencing a mate-paired library with an insert size of 3\,kb from Saccharomyces cerevisiae. We present an analysis of the library quality statistics and a new bio-informatics tool called DeLoxer that can be used to analyze an IlluminaCre-Lox mate-paired data set. We also demonstrate how the resulting data significantly improves a de novo assembly of the S. cerevisiae genome.},
  author = {Van Nieuwerburgh, Filip and Thompson, Ryan C and Ledesma, Jessica and Deforce, Dieter and Gaasterland, Terry and Ordoukhanian, Phillip and Head, Steven R},
  date = {2012-02-01},
  doi = {10/fmzd3r},
  eprint = {22127871},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Van Nieuwerburgh et al. - 2012 - Illumina mate-paired DNA sequencing-library prepar.pdf},
  issn = {1362-4962},
  journaltitle = {Nucleic Acids Research},
  number = {3},
  pages = {e24},
  title = {Illumina Mate-Paired {{DNA}} Sequencing-Library Preparation Using {{Cre}}-{{Lox}} Recombination},
  volume = {40}
}

@article{Varemo2013,
  author       = {V\"aremo, Leif and Nielsen, Jens and Nookaew, Intawat},
  title        = {Enriching the Gene Set Analysis of Genome-Wide Data by Incorporating Directionality of Gene Expression and Combining Statistical Hypotheses and Methods},
  journaltitle = {Nucleic Acids Research},
  date         = {2013-04-01},
  volume       = {41},
  number       = {8},
  pages        = {4378-4391},
  doi          = {10/f2zvht},
  eprint       = {23444143},
  eprinttype   = {pmid},
  issn         = {1362-4962},
  keywords     = {Data Interpretation,Gene Expression Profiling,Genes,Genomics,Genomics: methods,GSEA,piano,Principal Component Analysis,Saccharomyces cerevisiae,Saccharomyces cerevisiae: genetics,Saccharomyces cerevisiae: metabolism,Software,Statistical},
  abstract     = {Gene set analysis (GSA) is used to elucidate genome-wide data, in particular transcriptome data. A multitude of methods have been proposed for this step of the analysis, and many of them have been compared and evaluated. Unfortunately, there is no consolidated opinion regarding what methods should be preferred, and the variety of available GSA software and implementations pose a difficulty for the end-user who wants to try out different methods. To address this, we have developed the R package Piano that collects a range of GSA methods into the same system, for the benefit of the end-user. Further on we refine the GSA workflow by using modifications of the gene-level statistics. This enables us to divide the resulting gene set P-values into three classes, describing different aspects of gene expression directionality at gene set level. We use our fully implemented workflow to investigate the impact of the individual components of GSA by using microarray and RNA-seq data. The results show that the evaluated methods are globally similar and the major separation correlates well with our defined directionality classes. As a consequence of this, we suggest to use a consensus scoring approach, based on multiple GSA runs. In combination with the directionality classes, this constitutes a more thorough basis for an enriched biological interpretation.},
  file         = {/Users/ryan/Documents/Zotero Library/Väremo et al. - 2013 - Enriching the gene set analysis of genome-wide dat.pdf;/Users/ryan/Documents/Zotero Library/Väremo et al. - 2013 - Enriching the gene set analysis of genome-wide dat2.pdf},
}

@article{Vartanian2009,
  abstract = {Peripheral blood is an accessible and informative source of transcriptomal information for many human disease and pharmacogenomic studies. While there can be significant advantages to analyzing RNA isolated from whole blood, particularly in clinical studies, the preparation of samples for microarray analysis is complicated by the need to minimize artifacts associated with highly abundant globin RNA transcripts. The impact of globin RNA transcripts on expression profiling data can potentially be reduced by using RNA preparation and labeling methods that remove or block globin RNA during the microarray assay. We compared four different methods for preparing microarray hybridization targets from human whole blood collected in PAXGene tubes. Three of the methods utilized the Affymetrix one-cycle cDNA synthesis/in vitro transcription protocol but varied treatment of input RNA as follows: i. no treatment; ii. treatment with GLOBINclear; or iii. treatment with globin PNA oligos. In the fourth method cDNA targets were prepared with the Ovation amplification and labeling system.},
  author = {Vartanian, Kristina and Slottke, Rachel and Johnstone, Timothy and Casale, Amanda and Planck, Stephen R and Choi, Dongseok and Smith, Justine R and Rosenbaum, James T and Harrington, Christina A},
  date = {2009-01},
  doi = {10/fn4mnq},
  eprint = {19123946},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Vartanian et al. - 2009 - Gene expression profiling of whole blood comparis.pdf},
  issn = {1471-2164},
  journaltitle = {BMC Genomics},
  keywords = {Freezing,Gene Expression Profiling,Gene Expression Profiling: methods,Globins,Globins: genetics,Humans,NF-kappa B,NF-kappa B: genetics,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods,Reproducibility of Results,RNA,RNA; Messenger,RNA; Messenger: blood,RNA: blood,Sensitivity and Specificity},
  pages = {2},
  title = {Gene Expression Profiling of Whole Blood: Comparison of Target Preparation Methods for Accurate and Reproducible Microarray Analysis},
  volume = {10}
}

@article{Vijay2013,
  abstract = {Transcriptome Shotgun Sequencing (RNA-seq) has been readily embraced by geneticists and molecular ecologists alike. As with all high-throughput technologies, it is critical to understand which analytic strategies are best suited and which parameters may bias the interpretation of the data. Here we use a comprehensive simulation approach to explore how various features of the transcriptome (complexity, degree of polymorphism {$\pi$}, alternative splicing), technological processing (sequencing error {$\epsilon$}, library normalization) and bioinformatic workflow (de novo vs. mapping assembly, reference genome quality) impact transcriptome quality and inference of differential gene expression (DE). We find that transcriptome assembly and gene expression profiling (EdgeR vs. BaySeq software) works well even in the absence of a reference genome and is robust across a broad range of parameters. We advise against library normalization and in most situations advocate mapping assemblies to an annotated genome of a divergent sister clade, which generally outperformed de novo assembly (Trans-Abyss, Trinity, Soapdenovo-Trans). Transcriptome complexity (size, paralogs, alternative splicing isoforms) negatively affected the assembly and DE profiling, whereas the effects of sequencing error and polymorphism were almost negligible. Finally, we highlight the challenge of gene name assignment for de novo assemblies, the importance of mapping strategies and raise awareness of challenges associated with the quality of reference genomes. Overall, our results have significant practical and methodological implications and can provide guidance in the design and analysis of RNA-seq experiments, particularly for organisms where genomic background information is lacking.},
  author = {Vijay, Nagarjun and Poelstra, Jelmer W and K\"unstner, Axel and Wolf, Jochen B W},
  date = {2013-03},
  doi = {10/f2z7w8},
  eprint = {22998089},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Vijay et al. - 2013 - Challenges and strategies in transcriptome assembl.pdf},
  issn = {1365-294X},
  journaltitle = {Molecular Ecology},
  keywords = {bioinformatics,comparative genomics,differential gene expression,features of the transcriptome,rna-seq,simulation,systems biology,transcriptome assembly},
  number = {3},
  pages = {620-634},
  title = {Challenges and Strategies in Transcriptome Assembly and Differential Gene Expression Quantification. {{A}} Comprehensive in Silico Assessment of {{RNA}}-Seq Experiments},
  volume = {22}
}

@article{Volkmar2012,
  abstract = {In addition to genetic predisposition, environmental and lifestyle factors contribute to the pathogenesis of type 2 diabetes (T2D). Epigenetic changes may provide the link for translating environmental exposures into pathological mechanisms. In this study, we performed the first comprehensive DNA methylation profiling in pancreatic islets from T2D and non-diabetic donors. We uncovered 276 CpG loci affiliated to promoters of 254 genes displaying significant differential DNA methylation in diabetic islets. These methylation changes were not present in blood cells from T2D individuals nor were they experimentally induced in non-diabetic islets by exposure to high glucose. For a subgroup of the differentially methylated genes, concordant transcriptional changes were present. Functional annotation of the aberrantly methylated genes and RNAi experiments highlighted pathways implicated in {$\beta$}-cell survival and function; some are implicated in cellular dysfunction while others facilitate adaptation to stressors. Together, our findings offer new insights into the intricate mechanisms of T2D pathogenesis, underscore the important involvement of epigenetic dysregulation in diabetic islets and may advance our understanding of T2D aetiology.},
  author = {Volkmar, Michael and Dedeurwaerder, Sarah and Cunha, Daniel A. and Ndlovu, Matladi N. and Defrance, Matthieu and Deplus, Rachel and Calonne, Emilie and Volkmar, Ute and Igoillo-Esteve, Mariana and Naamane, Najib and Del Guerra, Silvia and Masini, Matilde and Bugliani, Marco and Marchetti, Piero and Cnop, Miriam and Eizirik, Decio L. and Fuks, Fran{\c c}ois},
  date = {2012},
  doi = {10/fzd945},
  file = {/Users/ryan/Documents/Zotero Library/Volkmar et al. - 2012 - DNA methylation profiling identifies epigenetic dy.pdf},
  issn = {0261-4189},
  journaltitle = {EMBO Journal},
  keywords = {DNA methylation,pancreatic islets,type 2 diabetes},
  number = {6},
  pages = {1405-1426},
  title = {{{DNA}} Methylation Profiling Identifies Epigenetic Dysregulation in Pancreatic Islets from Type 2 Diabetic Patients},
  volume = {31}
}

@article{Voss2011,
  abstract = {The glucocorticoid receptor (GR), like other eukaryotic transcription factors, regulates gene expression by interacting with chromatinized DNA response elements. Photobleaching experiments in living cells indicate that receptors transiently interact with DNA on the time scale of seconds and predict that the response elements may be sparsely occupied on average. Here, we show that the binding of one receptor at the glucocorticoid response element (GRE) does not reduce the steady-state binding of another receptor variant to the same GRE. Mathematical simulations reproduce this noncompetitive state using short GR/GRE residency times and relatively long times between DNA binding events. At many genomic sites where GR binding causes increased chromatin accessibility, concurrent steady-state binding levels for the variant receptor are actually increased, a phenomenon termed assisted loading. Temporally sparse transcription factor-DNA interactions induce local chromatin reorganization, resulting in transient access for binding of secondary regulatory factors.},
  author = {Voss, Ty C and Schiltz, R Louis and Sung, Myong-Hee and Yen, Paul M and Stamatoyannopoulos, John A and Biddie, Simon C and Johnson, Thomas A and Miranda, Tina B and John, Sam and Hager, Gordon L},
  date = {2011-08-19},
  doi = {10/cghfns},
  eprint = {21835447},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Voss et al. - 2011 - Dynamic exchange at regulatory elements during chr.pdf},
  issn = {1097-4172},
  journaltitle = {Cell},
  keywords = {Adenosine Triphosphate,Adenosine Triphosphate: metabolism,Animals,Cell Line; Tumor,Chromatin Assembly and Disassembly,Mammary Tumor Virus; Mouse,Mice,Models; Biological,Monte Carlo Method,Nucleosomes,Nucleosomes: metabolism,Receptors; Estrogen,Receptors; Estrogen: metabolism,Receptors; Glucocorticoid,Receptors; Glucocorticoid: metabolism,Regulatory Sequences; Nucleic Acid,Response Elements,Transcription Factors,Transcription Factors: metabolism},
  number = {4},
  pages = {544-554},
  title = {Dynamic Exchange at Regulatory Elements during Chromatin Remodeling Underlies Assisted Loading Mechanism},
  volume = {146}
}

@article{wanCEDERAccurateDetection2012,
  abstract = {RNA-Seq is widely used in transcriptome studies, and the detection of differentially expressed genes (DEGs) between two classes of individuals, e.g., cases versus controls, using RNA-Seq is of fundamental importance. Many statistical methods for DEG detection based on RNA-Seq data have been developed and most of them are based on the read counts mapped to individual genes. On the other hand, genes are composed of exons and the distribution of reads for the different exons can be heterogeneous. We hypothesize that the detection accuracy of differentially expressed genes can be increased by analyzing individual exons within a gene and then combining the results of the exons. We therefore developed a novel program, termed CEDER, to accurately detect DEGs by combining the significance of the exons. CEDER first tests for differentially expressed exons yielding a p-value for each, and then gives a score indicating the potential for a gene to be differentially expressed by integrating the p-values of the exons in the gene. We showed that CEDER can significantly increase the accuracy of existing methods for detecting DEGs on two benchmark RNA-Seq data sets and simulated datasets.},
  author = {Wan, Lin and Sun, Fengzhu},
  date = {2012-09},
  doi = {10/ggcxjd},
  file = {/Users/ryan/Documents/Zotero Library/Wan and Sun - 2012 - CEDER Accurate Detection of Differentially Expres.pdf;/Users/ryan/Zotero/storage/ATAZ4BQY/6205734.html},
  ids = {Wan2012},
  issn = {1545-5963, 1557-9964, 2374-0043},
  journaltitle = {IEEE/ACM Transactions on Computational Biology and Bioinformatics},
  keywords = {Accuracy,Bioinformatics,CEDER,combined p-value statistic,DEG detection,differentially expressed gene,exons,Exons,gene expression,Gene Expression Profiling,genetics,Genomics,high-throughput sequencing,Image edge detection,molecular biophysics,Oligonucleotide Array Sequence Analysis,RNA,RNA-Seq,RNA-Seq data,Sequence Analysis; RNA,simulated datasets,Software,Standards,Statistical analysis,transcriptome,Transcriptome},
  number = {5},
  pages = {1281-1292},
  shorttitle = {{{CEDER}}},
  title = {{{CEDER}}: {{Accurate}} Detection of Differentially Expressed Genes by Combining Significance of Exons Using {{RNA}}-{{Seq}}},
  volume = {9}
}

@report{Wang2011,
  author = {Wang, Liguo},
  date = {2011-11-10},
  file = {/Users/ryan/Documents/Zotero Library/Wang - 2011 - The EVER-seq manual.pdf},
  institution = {{Baylor College of Medicine}},
  keywords = {⛔ No DOI found},
  title = {The {{EVER}}-Seq Manual},
  type = {Software manual}
}

@article{Wang2011a,
  author       = {Wang, Kevin C and Yang, Yul W and Liu, Bo and Sanyal, Amartya and Corces-Zimmerman, Ryan and Chen, Yong and Lajoie, Bryan R and Protacio, Angeline and Flynn, Ryan A and Gupta, Rajnish A and Wysocka, Joanna and Lei, Ming and Dekker, Job and Helms, Jill A and Chang, Howard Y},
  title        = {A Long Noncoding {{RNA}} Maintains Active Chromatin to Coordinate Homeotic Gene Expression},
  journaltitle = {Nature},
  date         = {2011-04-07},
  volume       = {472},
  number       = {7341},
  pages        = {120-124},
  doi          = {10/fm3jds},
  issn         = {0028-0836},
  file         = {/Users/ryan/Documents/Zotero Library/Wang et al. - 2011 - A long noncoding RNA maintains active chromatin to.pdf},
}

@article{Wang2012,
  abstract = {MCScan is an algorithm able to scan multiple genomes or subgenomes in order to identify putative homologous chromosomal regions, and align these regions using genes as anchors. The MCScanX toolkit implements an adjusted MCScan algorithm for detection of synteny and collinearity that extends the original software by incorporating 14 utility programs for visualization of results and additional downstream analyses. Applications of MCScanX to several sequenced plant genomes and gene families are shown as examples. MCScanX can be used to effectively analyze chromosome structural changes, and reveal the history of gene family expansions that might contribute to the adaptation of lineages and taxa. An integrated view of various modes of gene duplication can supplement the traditional gene tree analysis in specific families. The source code and documentation of MCScanX are freely available at http://chibba.pgml.uga.edu/mcscan2/.},
  author = {Wang, Yupeng and Tang, Haibao and Debarry, Jeremy D and Tan, Xu and Li, Jingping and Wang, Xiyin and Lee, Tae-ho and Jin, Huizhe and Marler, Barry and Guo, Hui and Kissinger, Jessica C and Paterson, Andrew H},
  date = {2012-04},
  doi = {10/fzn3xm},
  eprint = {22217600},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Wang et al. - 2012 - MCScanX a toolkit for detection and evolutionary .pdf},
  issn = {1362-4962},
  journaltitle = {Nucleic Acids Research},
  keywords = {Algorithms,Angiosperms,Angiosperms: genetics,Evolution; Molecular,Gene Duplication,Gene Order,Genome; Plant,Genomics,Multigene Family,Software,Synteny},
  number = {7},
  pages = {e49},
  title = {{{MCScanX}}: A Toolkit for Detection and Evolutionary Analysis of Gene Synteny and Collinearity},
  volume = {40}
}

@article{Wang2014,
  abstract = {Histone modification (HM) patterns are widely applied to identify transcription factor binding regions (TFBRs). However, how frequently the TFBRs overlap with genomic regions enriched with certain types of HMs and which HM marker is more effective to pinpoint the TFBRs have not been systematically investigated. To address these problems, we studied 149 transcription factor (TF) ChIP-seq datasets and 33 HM ChIP-seq datasets in three cell lines. We found that on average about 90\% of the TFBRs overlap with the H3K4me2-enriched regions. Moreover, the H3K4me2-enriched regions with stronger signals of H3K4me2 enrichment more likely overlap with the TFBRs than those with weaker signals. In addition, we showed that the H3K4me2-enriched regions together with the H3K27ac-enriched regions can greatly reduce false positive predictions of the TFBRs. Our study sheds light on the comprehensive discovery of the TFBRs using the HeK4me-enriched regions, especially when no good antibody to a TF exists.},
  author = {Wang, Ying and Li, Xiaoman and Hu, Haiyan},
  date = {2014-02-12},
  doi = {10/f5zhjr},
  eprint = {24530516},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Wang et al. - 2014 - H3K4me2 reliably defines transcription factor bind.pdf},
  issn = {1089-8646},
  journaltitle = {Genomics},
  keywords = {ChIP-seq,H3K4me2,Histone modification,Transcription factor binding regions},
  title = {{{H3K4me2}} Reliably Defines Transcription Factor Binding Regions in Different Cells}
}

@article{Wang2014a,
  author = {Wang, Pin and Xue, Yiquan and Han, Yanmei and Lin, Li and Wu, Cong and Xu, Sheng and Jiang, Zhengping and Xu, Junfang and Liu, Qiuyan and Cao, Xuetao},
  date = {2014},
  file = {/Users/ryan/Documents/Zotero Library/Wang et al. - 2014 - The STAT3-Binding Long Noncoding RNA lnc-DC Contro.pdf},
  issue = {April},
  journaltitle = {Science},
  pages = {310-313},
  title = {The {{STAT3}}-Binding Long Noncoding {{RNA}} {{lnc-DC}} Controls Human Dendritic Cell Differentiation},
  volume = {344}
}

@article{Wang2016,
  author = {Wang, Liguo and Nie, Jinfu and Sicotte, Hugues and Li, Ying and Eckel-Passow, Jeanette E. and Dasari, Surendra and Vedell, Peter T. and Barman, Poulami and Wang, Liewei and Weinshiboum, Richard and Jen, Jin and Huang, Haojie and Kohli, Manish and Kocher, Jean-Pierre A.},
  date = {2016},
  doi = {10/f8s4hp},
  file = {/Users/ryan/Documents/Zotero Library/Wang et al. - 2016 - Measure transcript integrity using RNA-seq data.pdf},
  issn = {1471-2105},
  journaltitle = {BMC Bioinformatics},
  keywords = {gene expression,rna-seq quality control,tin,transcript integrity number},
  number = {1},
  pages = {58},
  title = {Measure Transcript Integrity Using {{RNA}}-Seq Data},
  volume = {17}
}

@article{Warden2013,
  abstract = {BD-Func (BiDirectional FUNCtional enrichment) is an algorithm that calculates functional enrichment by comparing lists of pre-defined genes that are known to be activated versus inhibited in a pathway or by a regulatory molecule. This paper shows that BD-Func can correctly predict cell line alternations and patient characteristics with accuracy comparable to popular algorithms, with a significantly faster run-time. BD-Func can compare scores for individual samples across multiple groups as well as provide predictive statistics and receiver operating characteristic (ROC) plots to quantify the accuracy of the signature associated with a binary phenotypic variable. BD-Func facilitates collaboration and reproducibility by encouraging users to share novel molecular signatures in the BD-Func discussion group, which is where the novel progesterone receptor and LBH589 signatures from this paper can be found. The novel LBH589 signature presented in this paper also serves as a case study showing how a custom signature using cell line data can accurately predict activity in vivo. This software is available to download at https://sourceforge.net/projects/bdfunc/.},
  author = {Warden, Charles D and Kanaya, Noriko and Chen, Shiuan and Yuan, Yate-Ching},
  date = {2013-01},
  doi = {10/ggcxm2},
  eprint = {24058887},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Warden et al. - 2013 - BD-Func a streamlined algorithm for predicting ac.pdf},
  issn = {2167-8359},
  journaltitle = {PeerJ},
  pages = {e159},
  title = {{{BD}}-{{Func}}: A Streamlined Algorithm for Predicting Activation and Inhibition of Pathways},
  volume = {1}
}

@article{Wasserstein2016a,
  author = {Wasserstein, Ronald L. and Lazar, Nicole A.},
  date = {2016-04-02},
  doi = {10/bc4d},
  file = {/Users/ryan/Documents/Zotero Library/Wasserstein and Lazar - 2016 - The ASA Statement on p -Values Context, Process, .pdf},
  issn = {0003-1305},
  journaltitle = {The American Statistician},
  number = {2},
  pages = {129-133},
  title = {The {{ASA}} Statement on p-{{Values}}: {{Context}}, Process, and Purpose},
  volume = {70}
}

@article{Weng2006,
  abstract = {MOTIVATION: In microarray gene expression studies, the number of replicated microarrays is usually small because of cost and sample availability, resulting in unreliable variance estimation and thus unreliable statistical hypothesis tests. The unreliable variance estimation is further complicated by the fact that the technology-specific variance is intrinsically intensity-dependent.

RESULTS: The Rosetta error model captures the variance-intensity relationship for various types of microarray technologies, such as single-color arrays and two-color arrays. This error model conservatively estimates intensity error and uses this value to stabilize the variance estimation. We present two commonly used error models: the intensity error-model for single-color microarrays and the ratio error model for two-color microarrays or ratios built from two single-color arrays. We present examples to demonstrate the strength of our error models in improving statistical power of microarray data analysis, particularly, in increasing expression detection sensitivity and specificity when the number of replicates is limited.},
  author = {Weng, Lee and Dai, Hongyue and Zhan, Yihui and He, Yudong and Stepaniants, Sergey B and Bassett, Douglas E},
  date = {2006-05-01},
  doi = {10/fqgtf7},
  eprint = {16522673},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Weng et al. - 2006 - Rosetta error model for gene expression analysis..pdf},
  issn = {1367-4803},
  journaltitle = {Bioinformatics},
  keywords = {Algorithms,Analysis of Variance,Computer Simulation,Data Interpretation; Statistical,Gene Expression,Gene Expression Profiling,Gene Expression Profiling: methods,Gene Expression: physiology,Genetic Variation,Models; Genetic,Models; Statistical,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods,Reproducibility of Results,Sensitivity and Specificity},
  number = {9},
  pages = {1111-1121},
  title = {Rosetta Error Model for Gene Expression Analysis},
  volume = {22}
}

@article{Weng2012,
  abstract = {How the immune system remembers a previous encounter with a pathogen and responds more efficiently to a subsequent encounter has been one of the central enigmas for immunologists for over a century. The identification of pathogen-specific memory lymphocytes that arise after an infection provided a cellular basis for immunological memory. But the molecular mechanisms of immunological memory remain only partially understood. The emerging evidence suggests that epigenetic changes have a key role in controlling the distinct transcriptional profiles of memory lymphocytes and thus in shaping their function. In this Review, we summarize the recent progress that has been made in assessing the differential gene expression and chromatin modifications in memory CD4(+) and CD8(+) T cells, and we present our current understanding of the molecular basis of memory T cell function.},
  author = {Weng, Nan-Ping and Araki, Yasuto and Subedi, Kalpana},
  date = {2012-03-16},
  doi = {10/ggcxm3},
  eprint = {22421787},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Weng et al. - 2012 - The molecular basis of the memory T cell response.pdf},
  issn = {1474-1741},
  journaltitle = {Nature Reviews Immunology},
  number = {4},
  pages = {306-315},
  title = {The Molecular Basis of the Memory {{T}} Cell Response: Differential Gene Expression and Its Epigenetic Regulation},
  volume = {12}
}

@article{Wesche2004,
  abstract = {We estimate DNA sequence error rates in Genbank records containing protein-coding and non-coding DNA sequences by comparing sequences of the inbred mouse strain C57BL/6J, sequenced as part of the mouse genome project and independently by other laboratories. C57BL/6J was produced by more than 100 generations of brother-sister mating, and can be assumed to be virtually free of residual polymorphism and mutational variation, so differences between independent sequences can be attributed to error. The estimated single nucleotide error rate for coding DNA is 0.10\% (SE 0.012\%), which is substantially lower than previous estimates for error rates in Genbank accessions. The estimated single nucleotide error rate for intronic DNA sequences (0.22\%; SE 0.051\%) is significantly higher than the rate for coding DNA. Since error rates for the mouse genome sequence are very low, the vast majority of the errors we detected are likely to be in individual Genbank accessions. The frequency of insertion-deletion (indel) errors in non-coding DNA approaches that of single nucleotide errors in non-coding DNA, whereas indel errors are uncommon in coding sequences.},
  author = {Wesche, Philipp L and Gaffney, Daniel J and Keightley, Peter D},
  date = {2004},
  doi = {10/bcr56g},
  journaltitle = {DNA Sequence},
  keywords = {\#nosource},
  number = {5-6},
  pages = {362-364},
  title = {{{DNA}} Sequence Error Rates in {{Genbank}} Records Estimated Using the Mouse Genome as a Reference},
  volume = {15}
}

@article{Wesolowski2013,
  abstract = {Transcriptome-based biosensors are expected to have a large impact on the future of biotechnology. However, a central aspect of transcriptomics is differential expression analysis, where, currently, deep RNA sequencing (RNA-seq) has the potential to replace the microarray as the standard assay for RNA quantification. Our contributions here to RNA-seq differential expression analysis are two-fold. First, given the high cost of an RNA-seq run, biological replicates are rare, and therefore, information sharing across genes to obtain variance estimates is crucial.  To handle such information sharing in a rigorous manner, we propose an hierarchical, empirical Bayes approach (R-EBSeq) that combines the Cufflinks model for generating relative transcript abundance measurements, known as FPKM (fragments per kilobase of transcript length per million mapped reads) with the EBArrays framework, which was previously developed for empirical Bayes analysis of microarray data. A desirable feature of R-EBSeq is easy-to-implement analysis of more than pairwise comparisons, as we illustrate with experimental data. Secondly, we develop the standard RNA-seq test data set, on the level of reads, where 79 transcripts are artificially differentially expressed and, therefore, explicitly known. This test data set allows us to compare the performance, in terms of the true discovery rate, of R-EBSeq to three other widely used RNAseq data analysis packages: Cuffdiff, DEseq and BaySeq. Our analysis indicates that DESeq identifies the first half of the differentially expressed transcripts well, but then is outperformed by Cuffdiff and R-EBSeq. Cuffdiff and R-EBSeq are the two top performers. Thus, R-EBSeq offers good performance, while allowing flexible and rigorous comparison of multiple biological conditions.},
  author = {Wesolowski, Sergiusz and Birtwistle, Marc and Rempala, Grzegorz},
  date = {2013-06-28},
  doi = {10/gb87p4},
  file = {/Users/ryan/Documents/Zotero Library/Wesolowski et al. - 2013 - A Comparison of Methods for RNA-Seq Differential E.pdf},
  issn = {2079-6374},
  journaltitle = {Biosensors},
  keywords = {next-generation sequencing,empirical Bayes,gene expression data},
  number = {3},
  pages = {238-258},
  title = {A Comparison of Methods for {{RNA}}-{{Seq}} Differential Expression Analysis and a New Empirical {{Bayes}} Approach},
  volume = {3}
}

@article{Wickersheim2013,
  abstract = {A large number of methods are available to deplete ribosomal RNA reads from high-throughput RNA sequencing experiments. Such methods are critical for sequencing Drosophila small RNAs between 20 and 30 nucleotides because size selection is not typically sufficient to exclude the highly abundant class of 30 nucleotide 2S rRNA. Here we demonstrate that pre-annealing terminator oligos complimentary to Drosophila 2S rRNA prior to 5' adapter ligation and reverse transcription efficiently depletes 2S rRNA sequences from the sequencing reaction in a simple and inexpensive way. This depletion is highly specific and is achieved with minimal perturbation of miRNA and piRNA profiles.},
  author = {Wickersheim, Michelle L and Blumenstiel, Justin P},
  date = {2013-11},
  doi = {10/ggcxm4},
  eprint = {24215643},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Wickersheim and Blumenstiel - 2013 - Terminator oligo blocking efficiently eliminates r.pdf},
  issn = {1940-9818},
  journaltitle = {BioTechniques},
  keywords = {2s rrna,drosophila,mirna,pirna,small rna sequencing},
  number = {5},
  pages = {269-272},
  title = {Terminator Oligo Blocking Efficiently Eliminates {{rRNA}} from {{Drosophila}} Small {{RNA}} Sequencing Libraries},
  volume = {55}
}

@article{Wickham2011,
  abstract = {We propose a new framework for visualising tables of counts, proportions and probabilities. We call our framework product plots, alluding to the computation of area as a product of height and width, and the statistical concept of generating a joint distribution from the product of conditional and marginal distributions. The framework, with extensions, is sufficient to encompass over 20 visualisations previously described in fields of statistical graphics and infovis, including bar charts, mosaic plots, treemaps, equal area plots and fluctuation diagrams.},
  author = {Wickham, Hadley and Hofmann, Heike},
  date = {2011-12},
  doi = {10/b2pc59},
  eprint = {22034341},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Wickham and Hofmann - 2011 - Product plots..pdf},
  issn = {1941-0506},
  journaltitle = {IEEE Transactions on Visualization and Computer Graphics},
  number = {12},
  pages = {2223-2230},
  title = {Product Plots},
  volume = {17}
}

@article{wickhamVisualizingStatisticalModels2015,
  author       = {Wickham, Hadley and Cook, Dianne and Hofmann, Heike},
  title        = {Visualizing Statistical Models: {{Removing}} the Blindfold},
  shorttitle   = {Visualizing Statistical Models},
  journaltitle = {Statistical Analysis and Data Mining: The ASA Data Science Journal},
  date         = {2015},
  volume       = {8},
  number       = {4},
  pages        = {203-225},
  doi          = {10/gdf33p},
  issn         = {1932-1872},
  langid       = {english},
  keywords     = {classification,data mining,exploratory data analysis,high-dimensional data,model visualization},
  file         = {/Users/ryan/Documents/Zotero Library/Wickham et al. - 2015 - Visualizing statistical models Removing the blind.pdf;/Users/ryan/Zotero/storage/UN9JPBMY/sam.html},
  abstract     = {Visualization can help in model building, diagnosis, and in developing an understanding about how a model summarizes data. This paper proposes three strategies for visualizing statistical models: (i) display the model in the data space, (ii) look at all members of a collection, and (iii) explore the process of model fitting, not just the end result. Each strategy is accompanied by examples, including manova, classification algorithms, hierarchical clustering, ensembles of linear models, projection pursuit, self-organizing maps, and neural networks.},
}

@article{Wilbanks2010,
  abstract = {Next-generation DNA sequencing coupled with chromatin immunoprecipitation (ChIP-seq) is revolutionizing our ability to interrogate whole genome protein-DNA interactions. Identification of protein binding sites from ChIP-seq data has required novel computational tools, distinct from those used for the analysis of ChIP-Chip experiments. The growing popularity of ChIP-seq spurred the development of many different analytical programs (at last count, we noted 31 open source methods), each with some purported advantage. Given that the literature is dense and empirical benchmarking challenging, selecting an appropriate method for ChIP-seq analysis has become a daunting task. Herein we compare the performance of eleven different peak calling programs on common empirical, transcription factor datasets and measure their sensitivity, accuracy and usability. Our analysis provides an unbiased critical assessment of available technologies, and should assist researchers in choosing a suitable tool for handling ChIP-seq data.},
  author = {Wilbanks, Elizabeth G and Facciotti, Marc T},
  date = {2010-01},
  doi = {10/c83wr4},
  eprint = {20628599},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Wilbanks and Facciotti - 2010 - Evaluation of algorithm performance in ChIP-seq pe.pdf},
  issn = {1932-6203},
  journaltitle = {PLoS ONE},
  keywords = {Algorithms,Chromatin Immunoprecipitation,Chromatin Immunoprecipitation: methods,Sequence Analysis; DNA,Sequence Analysis; DNA: methods},
  number = {7},
  pages = {e11471},
  title = {Evaluation of Algorithm Performance in {{ChIP}}-Seq Peak Detection},
  volume = {5}
}

@article{Wilkinson2006,
  abstract = {Studies suggest that surveillance or protocol biopsies that are performed during the first year after kidney transplantation may be clinically useful in identifying early acute rejection or chronic allograft nephropathy at a point when they may be amenable to treatment. Although the benefit of this approach has yet to be evaluated in large, multicenter, prospective trials, numerous studies suggest that implementation of protocol biopsies may improve long-term graft function. In particular, a number of reports suggest that detection of chronic allograft nephropathy in early protocol biopsies is predictive of subsequent graft function and loss and that early treatment may have a dramatic effect on the outcome of the graft. Protocol biopsies also have the potential to be of great value in high-risk patients, such as those with delayed graft function, by allowing for early intervention for acute rejection. Furthermore, the procedure seems to be relatively straightforward and safe. Nevertheless, paucity of data has meant that clear proof of a benefit of early treatment of subclinical rejection and chronic allograft nephropathy detected by protocol biopsy is lacking. Moreover, the optimal timing of protocol biopsies and reliable methods to quantify the histologic changes observed in biopsy specimens have yet to be determined. This review discusses the pros and cons of protocol biopsies and considers the place of this procedure in the routine treatment of kidney transplant patients.},
  author = {Wilkinson, Alan},
  date = {2006},
  doi = {10/cq7cjq},
  file = {/Users/ryan/Documents/Zotero Library/Wilkinson - 2006 - Protocol transplant biopsies are they really need.pdf},
  issn = {1555-905X},
  journaltitle = {Clinical Journal of the American Society of Nephrology},
  number = {1},
  pages = {130-137},
  title = {Protocol Transplant Biopsies: Are They Really Needed?},
  volume = {1}
}

@article{Williamson2011,
  abstract = {Processive reactions, such as transcription or translation, often proceed through distinct initiation and elongation phases. The processive formation of polymeric ubiquitin chains can accordingly be catalyzed by specialized initiating and elongating E2 enzymes, but the functional significance for this division of labor has remained unclear. Here, we have identified sequence motifs in several substrates of the anaphase-promoting complex (APC/C) that are required for efficient chain initiation by its E2 Ube2C. Differences in the quality and accessibility of these chain initiation motifs can determine the rate of a substrate's degradation without affecting its affinity for the APC/C, a mechanism used by the APC/C to control the timing of substrate proteolysis during the cell cycle. Based on our results, we propose that initiation motifs and their cognate E2s allow E3 enzymes to exert precise temporal control over substrate degradation.},
  author = {Williamson, Adam and Banerjee, Sudeep and Zhu, Xining and Philipp, Isabelle and Iavarone, Anthony T and Rape, Michael},
  date = {2011-06-24},
  doi = {10/bgq94g},
  eprint = {21700221},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Williamson et al. - 2011 - Regulation of ubiquitin chain initiation to contro.pdf},
  issn = {1097-4164},
  journaltitle = {Molecular Cell},
  keywords = {HEK293 Cells,Humans,Time Factors,Ubiquitin,Ubiquitin-Conjugating Enzymes,Ubiquitin-Conjugating Enzymes: metabolism,Ubiquitin-Protein Ligase Complexes,Ubiquitin-Protein Ligase Complexes: metabolism,Ubiquitin-Protein Ligases,Ubiquitin-Protein Ligases: metabolism,Ubiquitin: metabolism},
  number = {6},
  pages = {744-757},
  title = {Regulation of Ubiquitin Chain Initiation to Control the Timing of Substrate Degradation},
  volume = {42}
}

@online{Wilson2013,
  author = {Wilson, Richard K and Warren, Wesley},
  date = {2013},
  keywords = {\#nosource},
  title = {{{Macaca}} {{fascicularis}} (Cynomolgus Macaque) Sequence Assembly},
  url = {http://www.ncbi.nlm.nih.gov/assembly/GCF_000364345.1}
}

@article{Wilson2018,
  abstract = {Analysis of ``big data'' frequently involves statistical comparison of millions of competing hypotheses to discover hidden processes underlying observed patterns of data, for example, in the search for genetic determinants of disease in genome-wide association studies (GWAS). Controlling the familywise error rate (FWER) is considered the strongest protection against false positives but makes it difficult to reach the multiple testing-corrected significance threshold. Here, I introduce the harmonic mean p-value (HMP), which controls the FWER while greatly improving statistical power by combining dependent tests using generalized central limit theorem. I show that the HMP effortlessly combines information to detect statistically significant signals among groups of individually nonsignificant hypotheses in examples of a human GWAS for neuroticism and a joint human\textendash{}pathogen GWAS for hepatitis C viral load. The HMP simultaneously tests all ways to group hypotheses, allowing the smallest groups of hypotheses that retain significance to be sought. The power of the HMP to detect significant hypothesis groups is greater than the power of the Benjamini\textendash{}Hochberg procedure to detect significant hypotheses, although the latter only controls the weaker false discovery rate (FDR). The HMP has broad implications for the analysis of large datasets, because it enhances the potential for scientific discovery.},
  author = {Wilson, Daniel J.},
  date = {2019},
  doi = {10/gft4kn},
  eprint = {30610179},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Wilson - 2019 - The harmonic mean p-value for combining dependent .pdf},
  issn = {1091-6490},
  journaltitle = {Proceedings of the National Academy of Sciences of the United States of America},
  keywords = {Big data,False positives,Model averaging,Multiple testing,P-values},
  number = {4},
  pages = {1195-1200},
  title = {The Harmonic Mean P-Value for Combining Dependent Tests},
  volume = {116}
}

@article{Winn2010,
  abstract = {The use of mouse blood as a model for human blood is often considered in the development of clinically relevant, gene expression-based disease biomarkers. However, the ability to derive biologically meaningful insights from microarray-based gene expression patterns in mouse whole blood, as in human whole blood, is hindered by high levels of globin mRNA. In order to characterize the effects of globin reduction on gene expression of peripheral mouse blood, we performed gene set enrichment analysis on genes identified as expressed in blood via microarray-based genome-wide transcriptome analysis. Depletion of globin mRNA enhanced the quality of microarray data as shown by improved gene expression detection and increased sensitivity. Compared to genes expressed in whole blood, genes detected as expressed in blood following globin reduction were enriched for low abundance transcripts implicated in many biological pathways, including development, g-protein signaling, and immune response. Broadly, globin reduction resulted in improved detection of expressed genes that serve as molecular binding proteins and enzymes in cellular metabolism, intracellular transport/localization, transcription, and translation, as well as genes that potentially could act as biomarkers for diseases such as schizophrenia. These significantly enriched pathways overlap considerably with those identified in globin-reduced human blood suggesting that globin-reduced mouse blood gene expression studies may be useful for identifying genes relevant to human disease. Overall, the results of this investigation provide a better understanding of the impact of reducing globin transcripts in mouse blood and highlight the potential of microarray-based, globin-reduced, mouse blood gene expression studies in biomarker development.},
  author = {Winn, M. E. and Zapala, M. A. and Hovatta, Iiris and others},
  date = {2010-06},
  doi = {10/bjjwht},
  journaltitle = {Mammalian Genome},
  keywords = {\#nosource},
  number = {5-6},
  pages = {268-275},
  title = {The Effects of Globin on Microarray-Based Gene Expression Analysis of Mouse Blood},
  volume = {21}
}

@online{winterLinearModelsLinear2013,
  abstract = {This text is a conceptual introduction to mixed effects modeling with linguistic applications, using the R programming environment. The reader is introduced to linear modeling and assumptions, as well as to mixed effects/multilevel modeling, including a discussion of random intercepts, random slopes and likelihood ratio tests. The example used throughout the text focuses on the phonetic analysis of voice pitch data.},
  author = {Winter, Bodo},
  date = {2013-08-26},
  eprint = {1308.5499},
  eprintclass = {cs.CL},
  eprinttype = {arxiv},
  file = {/Users/ryan/Documents/Zotero Library/Winter - 2013 - Linear models and linear mixed effects models in R.pdf;/Users/ryan/Zotero/storage/TJYIP5NF/1308.html},
  keywords = {⛔ No DOI found,Computer Science - Computation and Language},
  title = {Linear Models and Linear Mixed Effects Models in {{R}} with Linguistic Applications},
  url = {http://arxiv.org/abs/1308.5499},
  urldate = {2019-11-14}
}

@article{Witten2011,
  author = {Witten, Daniela M.},
  date = {2011-12},
  doi = {10/fzgwgs},
  eprint = {1202.6201v1},
  eprinttype = {arxiv},
  file = {/Users/ryan/Documents/Zotero Library/Witten - 2011 - Classification and clustering of sequencing data u.pdf},
  issn = {1932-6157},
  journaltitle = {The Annals of Applied Statistics},
  keywords = {Classification,clustering,genomics,gene expression},
  number = {4},
  pages = {2493-2518},
  title = {Classification and Clustering of Sequencing Data Using a {{Poisson}} Model},
  volume = {5}
}

@article{Wu2010a,
  abstract = {A gene set test is a differential expression analysis in which a P-value is assigned to a set of genes as a unit. Gene set tests are valuable for increasing statistical power, organizing and interpreting results and for relating expression patterns across different experiments. Existing methods are based on permutation. Methods that rely on permutation of probes unrealistically assume independence of genes, while those that rely on permutation of sample are suitable only for two-group comparisons with a good number of replicates in each group.},
  author = {Wu, Di and Lim, Elgene and Vaillant, Fran{\c c}ois and Asselin-Labat, Marie-Liesse and Visvader, Jane E and Smyth, Gordon K},
  date = {2010-09-01},
  doi = {10/cvnw38},
  eprint = {20610611},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Wu et al. - 2010 - ROAST rotation gene set tests for complex microar.pdf},
  issn = {1367-4811},
  journaltitle = {Bioinformatics},
  keywords = {Algorithms,Animals,Gene Expression Profiling,Gene Expression Profiling: methods,Humans,Linear Models,Mice,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods},
  number = {17},
  pages = {2176-2182},
  title = {{{ROAST}}: Rotation Gene Set Tests for Complex Microarray Experiments},
  volume = {26}
}

@article{Wu2010b,
  abstract = {Next-generation sequencing captures sequence differences in reads relative to a reference genome or transcriptome, including splicing events and complex variants involving multiple mismatches and long indels. We present computational methods for fast detection of complex variants and splicing in short reads, based on a successively constrained search process of merging and filtering position lists from a genomic index. Our methods are implemented in GSNAP (Genomic Short-read Nucleotide Alignment Program), which can align both single- and paired-end reads as short as 14 nt and of arbitrarily long length. It can detect short- and long-distance splicing, including interchromosomal splicing, in individual reads, using probabilistic models or a database of known splice sites. Our program also permits SNP-tolerant alignment to a reference space of all possible combinations of major and minor alleles, and can align reads from bisulfite-treated DNA for the study of methylation state.},
  author = {Wu, Thomas D and Nacu, Serban},
  date = {2010-04-01},
  doi = {10/dp4m7h},
  eprint = {20147302},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Wu and Nacu - 2010 - Fast and SNP-tolerant detection of complex variant.pdf;/Users/ryan/Documents/Zotero Library/Wu and Nacu - 2010 - Fast and SNP-tolerant detection of complex variant2.pdf},
  issn = {1367-4811},
  journaltitle = {Bioinformatics},
  keywords = {Base Sequence,DNA; Recombinant,Genetic Variation,Genomics,Genomics: methods,Polymorphism; Single Nucleotide,RNA Splicing},
  number = {7},
  pages = {873-881},
  title = {Fast and {{SNP}}-Tolerant Detection of Complex Variants and Splicing in Short Reads},
  volume = {26}
}

@article{Wu2010c,
  abstract = {BACKGROUND: Recent technological advancements have made high throughput sequencing an increasingly popular approach for transcriptome analysis. Advantages of sequencing-based transcriptional profiling over microarrays have been reported, including lower technical variability. However, advances in technology do not remove biological variation between replicates and this variation is often neglected in many analyses.

RESULTS: We propose an empirical Bayes method, titled Analysis of Sequence Counts (ASC), to detect differential expression based on sequencing technology. ASC borrows information across sequences to establish prior distribution of sample variation, so that biological variation can be accounted for even when replicates are not available. Compared to current approaches that simply tests for equality of proportions in two samples, ASC is less biased towards highly expressed sequences and can identify more genes with a greater log fold change at lower overall abundance.

CONCLUSIONS: ASC unifies the biological and statistical significance of differential expression by estimating the posterior mean of log fold change and estimating false discovery rates based on the posterior mean. The implementation in R is available at http://www.stat.brown.edu/Zwu/research.aspx.},
  author = {Wu, Zhijin and Jenkins, Bethany D and Rynearson, Tatiana A and Dyhrman, Sonya T and Saito, Mak A and Mercier, Melissa and Whitney, Leann P},
  date = {2010},
  doi = {10/dzqkn7},
  eprint = {21080965},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Wu et al. - 2010 - Empirical bayes analysis of sequencing-based trans.pdf},
  issn = {1471-2105},
  journaltitle = {BMC Bioinformatics},
  keywords = {Bayes Theorem,Databases; Genetic,Gene Expression Profiling,Gene Expression Profiling: methods,Genomics,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods,Sequence Analysis; DNA,Sequence Analysis; RNA},
  number = {1},
  pages = {564},
  title = {Empirical Bayes Analysis of Sequencing-Based Transcriptional Profiling without Replicates},
  volume = {11}
}

@article{Wu2012,
  abstract = {Competitive gene set tests are commonly used in molecular pathway analysis to test for enrichment of a particular gene annotation category amongst the differential expression results from a microarray experiment. Existing gene set tests that rely on gene permutation are shown here to be extremely sensitive to inter-gene correlation. Several data sets are analyzed to show that inter-gene correlation is non-ignorable even for experiments on homogeneous cell populations using genetically identical model organisms. A new gene set test procedure (CAMERA) is proposed based on the idea of estimating the inter-gene correlation from the data, and using it to adjust the gene set test statistic. An efficient procedure is developed for estimating the inter-gene correlation and characterizing its precision. CAMERA is shown to control the type I error rate correctly regardless of inter-gene correlations, yet retains excellent power for detecting genuine differential expression. Analysis of breast cancer data shows that CAMERA recovers known relationships between tumor subtypes in very convincing terms. CAMERA can be used to analyze specified sets or as a pathway analysis tool using a database of molecular signatures.},
  author = {Wu, Di and Smyth, Gordon K},
  date = {2012-09-01},
  doi = {10/gbbwb9},
  eprint = {22638577},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Wu and Smyth - 2012 - Camera a competitive gene set test accounting for.pdf},
  issn = {1362-4962},
  journaltitle = {Nucleic Acids Research},
  number = {17},
  pages = {e133},
  title = {Camera: A Competitive Gene Set Test Accounting for Inter-Gene Correlation},
  volume = {40}
}

@article{Wu2012a,
  abstract = {Recent developments in RNA-sequencing (RNA-seq) technology have led to a rapid increase in gene expression data in the form of counts. RNA-seq can be used for a variety of applications, however, identifying differential expression (DE) remains a key task in functional genomics. There have been a number of statistical methods for DE detection for RNA-seq data. One common feature of several leading methods is the use of the negative binomial (Gamma-Poisson mixture) model. That is, the unobserved gene expression is modeled by a gamma random variable and, given the expression, the sequencing read counts are modeled as Poisson. The distinct feature in various methods is how the variance, or dispersion, in the Gamma distribution is modeled and estimated. We evaluate several large public RNA-seq datasets and find that the estimated dispersion in existing methods does not adequately capture the heterogeneity of biological variance among samples. We present a new empirical Bayes shrinkage estimate of the dispersion parameters and demonstrate improved DE detection.},
  author = {Wu, Hao and Wang, Chi and Wu, Zhijin},
  date = {2013-04-01},
  doi = {10/gb87sx},
  eprint = {23001152},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Wu et al. - 2013 - A new shrinkage estimator for dispersion improves 2.pdf;/Users/ryan/Documents/Zotero Library/Wu et al. - 2013 - A new shrinkage estimator for dispersion improves 3.pdf},
  issn = {1465-4644},
  journaltitle = {Biostatistics},
  keywords = {Bayes Theorem,Binomial Distribution,Biostatistics,Databases,Gene Expression Profiling,Gene Expression Profiling: statistics \& numerical data,Humans,Models,Nucleic Acid,Nucleic Acid: statistics \& numerical data,Poisson Distribution,RNA,RNA: statistics \& numerical data,Sequence Analysis,Statistical},
  number = {2},
  pages = {232-243},
  title = {A New Shrinkage Estimator for Dispersion Improves Differential Expression Detection in {{RNA}}-Seq Data},
  volume = {14}
}

@article{Wu2013,
  abstract = {MOTIVATION: Although chromatin immunoprecipitation coupled with high-throughput sequencing (ChIP-seq) or tiling array hybridization (ChIP-chip) is increasingly used to map genome-wide-binding sites of transcription factors (TFs), it still remains difficult to generate a quality ChIPx (i.e. ChIP-seq or ChIP-chip) dataset because of the tremendous amount of effort required to develop effective antibodies and efficient protocols. Moreover, most laboratories are unable to easily obtain ChIPx data for one or more TF(s) in more than a handful of biological contexts. Thus, standard ChIPx analyses primarily focus on analyzing data from one experiment, and the discoveries are restricted to a specific biological context. RESULTS: We propose to enrich this existing data analysis paradigm by developing a novel approach, ChIP-PED, which superimposes ChIPx data on large amounts of publicly available human and mouse gene expression data containing a diverse collection of cell types, tissues and disease conditions to discover new biological contexts with potential TF regulatory activities. We demonstrate ChIP-PED using a number of examples, including a novel discovery that MYC, a human TF, plays an important functional role in pediatric Ewing sarcoma cell lines. These examples show that ChIP-PED increases the value of ChIPx data by allowing one to expand the scope of possible discoveries made from a ChIPx experiment. AVAILABILITY: http://www.biostat.jhsph.edu/\textasciitilde{}gewu/ChIPPED/},
  author = {Wu, George and Yustein, Jason T. and McCall, Matthew N. and Zilliox, Michael and Irizarry, Rafael A. and Zeller, Karen and Dang, Chi V. and Ji, Hongkai},
  date = {2013},
  doi = {10/f4w3ph},
  eprint = {23457041},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Wu et al. - 2013 - ChIP-PED enhances the analysis of ChIP-seq and ChI.pdf},
  issn = {1460-2059},
  journaltitle = {Bioinformatics},
  number = {9},
  pages = {1182-1189},
  title = {{{ChIP}}-{{PED}} Enhances the Analysis of {{ChIP}}-Seq and {{ChIP}}-Chip Data},
  volume = {29}
}

@article{wuNewShrinkageEstimator2013,
  abstract = {Recent developments in RNA-sequencing (RNA-seq) technology have led to a rapid increase in gene expression data in the form of counts. RNA-seq can be used for a variety of applications, however, identifying differential expression (DE) remains a key task in functional genomics. There have been a number of statistical methods for DE detection for RNA-seq data. One common feature of several leading methods is the use of the negative binomial (Gamma-Poisson mixture) model. That is, the unobserved gene expression is modeled by a gamma random variable and, given the expression, the sequencing read counts are modeled as Poisson. The distinct feature in various methods is how the variance, or dispersion, in the Gamma distribution is modeled and estimated. We evaluate several large public RNA-seq datasets and find that the estimated dispersion in existing methods does not adequately capture the heterogeneity of biological variance among samples. We present a new empirical Bayes shrinkage estimate of the dispersion parameters and demonstrate improved DE detection.},
  author = {Wu, Hao and Wang, Chi and Wu, Zhijin},
  date = {2013-04-01},
  doi = {10/gb87sx},
  eprint = {23001152},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Wu et al. - 2013 - A new shrinkage estimator for dispersion improves .pdf;/Users/ryan/Zotero/storage/9DDXQ5VY/376433.html},
  issn = {1465-4644},
  journaltitle = {Biostatistics},
  langid = {english},
  number = {2},
  pages = {232-243},
  shortjournal = {Biostatistics},
  title = {A New Shrinkage Estimator for Dispersion Improves Differential Expression Detection in {{RNA}}-Seq Data},
  volume = {14}
}

@incollection{xuIdentifyingDifferentialHistone2012,
  abstract = {Epigenetic modifications are critical to gene regulations and genome functions. Among different epigenetic modifications, it is of great interest to study the differential histone modification sites (DHMSs), which contribute to the epigenetic dynamics and the gene regulations among various cell-types or environmental responses. ChIP-seq is a robust and comprehensive approach to capture the histone modifications at the whole genome scale. By comparing two histone modification ChIP-seq libraries, the DHMSs are potentially identifiable. With this aim, we proposed an approach called ChIPDiff for the genome-wide comparison of histone modification sites identified by ChIP-seq (Xu, Wei, Lin et al., Bioinformatics 24:2344\textendash{}2349, 2008). The approach employs a hidden Markov model (HMM) to infer the states of histone modification changes at each genomic location. We evaluated the performance of ChIPDiff by comparing the H3K27me3 modification sites between mouse embryonic stem cell (ESC) and neural progenitor cell (NPC). We demonstrated that the H3K27me3 DHMSs identified by our approach are of high sensitivity, specificity, and technical reproducibility. ChIPDiff was further applied to uncover the differential H3K4me3 and H3K36me3 sites between different cell states. The result showed significant correlation between the histone modification states and the gene expression levels.},
  author = {Xu, Han and Sung, Wing-Kin},
  booktitle = {Next Generation Microarray Bioinformatics: Methods and Protocols},
  date = {2012},
  doi = {10/b5sbgm},
  editor = {Wang, Junbai and Tan, Aik Choon and Tian, Tianhai},
  file = {/Users/ryan/Documents/Zotero Library/Xu and Sung - 2012 - Identifying Differential Histone Modification Site.pdf},
  isbn = {978-1-61779-400-1},
  keywords = {ChIP-seq,ChIPDiff,Differential histone modification site,Epigenetic modification,Hidden Markov model},
  langid = {english},
  location = {{Totowa, NJ}},
  number = {802},
  pages = {293-303},
  publisher = {{Humana Press}},
  series = {Methods in {{Molecular Biology}}},
  title = {Identifying Differential Histone Modification Sites from {{ChIP}}-Seq Data},
  url = {https://doi.org/10.1007/978-1-61779-400-1_19},
  urldate = {2019-11-14}
}

@article{Yaari2013,
  abstract = {Enrichment analysis of gene sets is a popular approach that provides a functional interpretation of genome-wide expression data. Existing tests are affected by inter-gene correlations, resulting in a high Type I error. The most widely used test, Gene Set Enrichment Analysis, relies on computationally intensive permutations of sample labels to generate a null distribution that preserves gene-gene correlations. A more recent approach, CAMERA, attempts to correct for these correlations by estimating a variance inflation factor directly from the data. Although these methods generate P-values for detecting gene set activity, they are unable to produce confidence intervals or allow for post hoc comparisons. We have developed a new computational framework for Quantitative Set Analysis of Gene Expression (QuSAGE). QuSAGE accounts for inter-gene correlations, improves the estimation of the variance inflation factor and, rather than evaluating the deviation from a null hypothesis with a P-value, it quantifies gene-set activity with a complete probability density function. From this probability density function, P-values and confidence intervals can be extracted and post hoc analysis can be carried out while maintaining statistical traceability. Compared with Gene Set Enrichment Analysis and CAMERA, QuSAGE exhibits better sensitivity and specificity on real data profiling the response to interferon therapy (in chronic Hepatitis C virus patients) and Influenza A virus infection. QuSAGE is available as an R package, which includes the core functions for the method as well as functions to plot and visualize the results.},
  author = {Yaari, Gur and Bolen, Christopher R and Thakar, Juilee and Kleinstein, Steven H},
  date = {2013-10},
  doi = {10/gbdx65},
  eprint = {23921631},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Yaari et al. - 2013 - Quantitative set analysis for gene expression a m.pdf},
  issn = {1362-4962},
  journaltitle = {Nucleic Acids Research},
  keywords = {Confidence Intervals,Data Interpretation; Statistical,Gene Expression Profiling,Gene Expression Profiling: methods,Genes,Humans,Influenza; Human,Influenza; Human: genetics,Influenza; Human: metabolism},
  number = {18},
  pages = {e170},
  title = {Quantitative Set Analysis for Gene Expression: A Method to Quantify Gene Set Differential Expression Including Gene-Gene Correlations},
  volume = {41}
}

% NOTE(review): several author given names in Yan2011 look machine-merged ("Yanfeng Yong", "Yingrui Yan", "Jian Jun Jufang", each repeated for what should be distinct people); verify against the published author list before citing.
@article{Yan2011,
  abstract = {The nonhuman primates most commonly used in medical research are from the genus Macaca. To better understand the genetic differences between these animal models, we present high-quality draft genome sequences from two macaque species, the cynomolgus/crab-eating macaque and the Chinese rhesus macaque. Comparison with the previously sequenced Indian rhesus macaque reveals that all three macaques maintain abundant genetic heterogeneity, including millions of single-nucleotide substitutions and many insertions, deletions and gross chromosomal rearrangements. By assessing genetic regions with reduced variability, we identify genes in each macaque species that may have experienced positive selection. Genetic divergence patterns suggest that the cynomolgus macaque genome has been shaped by introgression after hybridization with the Chinese rhesus macaque. Macaque genes display a high degree of sequence similarity with human disease gene orthologs and drug targets. However, we identify several putatively dysfunctional genetic differences between the three macaque species, which may explain functional differences between them previously observed in clinical studies.},
  author = {Yan, Guangmei and Zhang, Guojie and Fang, Xiaodong and Zhang, Yanfeng Yong and Li, Cai and Ling, Fei and Cooper, David N and Li, Qiye and Li, Yingrui Yan and van Gool, Alain J and Du, Hongli and Chen, Jiesi and Chen, Ronghua and Zhang, Pei and Huang, Zhiyong and Thompson, John R and Meng, Yuhuan and Bai, Yinqi and Wang, Jian Jun Jufang and Zhuo, Min and Wang, Tao and Huang, Ying and Wei, Liqiong and Li, Jianwen and Wang, Zhiwen and Hu, Haofu and Yang, Pengcheng and Le, Liang and Stenson, Peter D and Li, Bo and Liu, Xiaoming and Ball, Edward V and An, Na and Huang, Quanfei and Zhang, Yanfeng Yong and Fan, Wei and Zhang, Xiuqing and Li, Yingrui Yan and Wang, Wen and Katze, Michael G and Su, Bing and Nielsen, Rasmus and Yang, Huanming and Wang, Jian Jun Jufang and Wang, Xiaoning and Wang, Jian Jun Jufang},
  date = {2011-11},
  doi = {10/dhnhpg},
  eprint = {22002653},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Yan et al. - 2011 - Genome sequencing and comparison of two nonhuman p.pdf;/Users/ryan/Documents/Zotero Library/Yan et al. - 2011 - Genome sequencing and comparison of two nonhuman p2.pdf},
  issn = {1546-1696},
  journaltitle = {Nature Biotechnology},
  keywords = {Animal,Animals,Base Sequence,Chromosome Aberrations,cyno-genome,cyno-project,DNA,Evolution,Genetic,Genome,Humans,Macaca fascicularis,Macaca fascicularis: genetics,Macaca mulatta,Macaca mulatta: genetics,Models,Molecular,Molecular Sequence Data,Polymorphism,Sequence Analysis,Species Specificity},
  number = {11},
  options = {useprefix=true},
  pages = {1019-1023},
  title = {Genome Sequencing and Comparison of Two Nonhuman Primate Animal Models, the Cynomolgus and {{Chinese}} Rhesus Macaques},
  volume = {29}
}

@article{Yandell2012,
  abstract = {The falling cost of genome sequencing is having a marked impact on the research community with respect to which genomes are sequenced and how and where they are annotated. Genome annotation projects have generally become small-scale affairs that are often carried out by an individual laboratory. Although annotating a eukaryotic genome assembly is now within the reach of non-experts, it remains a challenging task. Here we provide an overview of the genome annotation process and the available tools and describe some best-practice approaches.},
  author = {Yandell, Mark and Ence, Daniel},
  date = {2012-05},
  doi = {10/gfkndq},
  eprint = {22510764},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Yandell and Ence - 2012 - A beginner's guide to eukaryotic genome annotation.pdf},
  issn = {1471-0064},
  journaltitle = {Nature Reviews Genetics},
  keywords = {Base Sequence,Databases; Genetic,Eukaryota,Eukaryota: genetics,Exons,Genome,Genomics,Humans,Introns,Molecular Sequence Annotation,Molecular Sequence Annotation: methods,Molecular Sequence Annotation: standards,Molecular Sequence Data,Quality Control,RNA,RNA: genetics,Sequence Alignment,Software},
  number = {5},
  pages = {329-342},
  title = {A Beginner's Guide to Eukaryotic Genome Annotation},
  volume = {13}
}

@article{Yang2005,
  abstract = {A common objective of microarray experiments is the detection of differential gene expression between samples obtained under different conditions. The task of identifying differentially expressed genes consists of two aspects: ranking and selection. Numerous statistics have been proposed to rank genes in order of evidence for differential expression. However, no one statistic is universally optimal and there is seldom any basis or guidance that can direct toward a particular statistic of choice.},
  author = {Yang, Yee Hwa and Xiao, Yuanyuan and Segal, Mark R},
  date = {2005-04-01},
  doi = {10/c9fch2},
  eprint = {15513985},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Yang et al. - 2005 - Identifying differentially expressed genes from mi.pdf},
  issn = {1367-4803},
  journaltitle = {Bioinformatics},
  keywords = {Algorithms,Computer Simulation,Data Interpretation; Statistical,Gene Expression Profiling,Gene Expression Profiling: methods,Models; Genetic,Models; Statistical,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods,Sequence Analysis; DNA,Sequence Analysis; DNA: methods,Software},
  number = {7},
  pages = {1084-1093},
  title = {Identifying Differentially Expressed Genes from Microarray Experiments via Statistic Synthesis},
  volume = {21}
}

@article{Yang2011,
  abstract = {Alfalfa, [Medicago sativa (L.) sativa], a widely-grown perennial forage has potential for development as a cellulosic ethanol feedstock. However, the genomics of alfalfa, a non-model species, is still in its infancy. The recent advent of RNA-Seq, a massively parallel sequencing method for transcriptome analysis, provides an opportunity to expand the identification of alfalfa genes and polymorphisms, and conduct in-depth transcript profiling.},
  author = {Yang, S Samuel and Tu, Zheng Jin and Cheung, Foo and Xu, Wayne Wenzhong and Lamb, JoAnn F S and Jung, Hans-Joachim G and Vance, Carroll P and Gronwald, John W},
  date = {2011-01},
  doi = {10/c28x3m},
  eprint = {21504589},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Yang et al. - 2011 - Using RNA-Seq for gene identification, polymorphis.pdf},
  issn = {1471-2164},
  journaltitle = {BMC Genomics},
  keywords = {Cell Wall,Cell Wall: metabolism,Gene Expression Profiling,Genes; Plant,Genes; Plant: genetics,Genotype,High-Throughput Nucleotide Sequencing,High-Throughput Nucleotide Sequencing: methods,Medicago sativa,Medicago sativa: cytology,Medicago sativa: genetics,Minisatellite Repeats,Minisatellite Repeats: genetics,Molecular Sequence Annotation,Plant Stems,Plant Stems: cytology,Plant Stems: genetics,Polymorphism; Single Nucleotide,Polymorphism; Single Nucleotide: genetics,RNA; Messenger,RNA; Messenger: genetics,Sequence Analysis; RNA,Sequence Analysis; RNA: methods,Titanium},
  number = {1},
  pages = {199},
  title = {Using {{RNA}}-{{Seq}} for Gene Identification, Polymorphism Detection and Transcript Profiling in Two Alfalfa Genotypes with Divergent Cell Wall Composition in Stems},
  volume = {12}
}

@article{Yang2013,
  abstract = {Recently, long noncoding RNAs (lncRNAs) were found to be dysregulated in a variety of tumors. However, it remains unknown how and through what molecular mechanisms the expression of lncRNAs is controlled. In this study, we found that the lncRNA Low Expression in Tumor (lncRNA-LET) was generally downregulated in hepatocellular carcinomas, colorectal cancers, and squamous-cell lung carcinomas. We demonstrated that hypoxia-induced histone deacetylase 3 repressed lncRNA-LET by reducing the histone acetylation-mediated modulation of the lncRNA-LET promoter region. Interestingly, the downregulation of lncRNA-LET was found to be a key step in the stabilization of nuclear factor 90 protein, which leads to hypoxia-induced cancer cell invasion. Moreover, the relationship among hypoxia, histone acetylation disorder, low lncRNA-LET expression level, and metastasis was found in clinical hepatocellular carcinoma samples. These results advance our understanding of the role of lncRNA-LET as a regulator of hypoxia signaling and offer new avenues for therapeutic intervention against cancer progression. \textcopyright{} 2013 Elsevier Inc.},
  author = {Yang, Fu and Huo, Xi-song and Yuan, Sheng-xian and Zhang, Ling and Zhou, Wei-ping and Wang, Fang and Sun, Shu-han},
  date = {2013},
  doi = {10/f4rf2k},
  eprint = {23395002},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Yang et al. - 2013 - Repression of the Long Noncoding RNA-LET by Histon.pdf},
  issn = {1097-2765},
  journaltitle = {Molecular Cell},
  number = {6},
  pages = {1083--1096},
  title = {Repression of the Long Noncoding {{RNA}}-{{LET}} by Histone Deacetylase 3 Contributes to Hypoxia-Mediated Metastasis},
  volume = {49}
}

@article{Yi2017,
  abstract = {Motivation: Batch effects are one of the major source of technical variations that affect the measurements in high-throughput studies such as RNA sequencing. It has been well established that batch effects can be caused by different experimental platforms, laboratory conditions, different sources of samples and personnel differences. These differences can confound the outcomes of interest and lead to spurious results. A critical input for batch correction algorithms is the knowledge of batch factors, which in many cases are unknown or inaccurate. Hence, the primary motivation of our paper is to detect hidden batch factors that can be used in standard techniques to accurately capture the relationship between gene expression and other modeled variables of interest. Results: We introduce a new algorithm based on data-adaptive shrinkage and semi-Non-negative Matrix Factorization for the detection of unknown batch effects. We test our algorithm on three different datasets: (i) Sequencing Quality Control, (ii) Topotecan RNA-Seq and (iii) Single-cell RNA sequencing (scRNA-Seq) on Glioblastoma Multiforme. We have demonstrated a superior performance in identifying hidden batch effects as compared to existing algorithms for batch detection in all three datasets. In the Topotecan study, we were able to identify a new batch factor that has been missed by the original study, leading to under-representation of differentially expressed genes. For scRNA-Seq, we demonstrated the power of our method in detecting subtle batch effects. Availability and implementation: DASC R package is available via Bioconductor or at https://github.com/zhanglabNKU/DASC. Contact: zhanghan@nankai.edu.cn or zhandonl@bcm.edu. Supplementary information: Supplementary data are available at Bioinformatics online.},
  author = {Yi, Haidong and Raman, Ayush T. and Zhang, Han and Allen, Genevera I. and Liu, Zhandong},
  date = {2018},
  doi = {10/gc9hk2},
  eprint = {29617963},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Yi et al. - 2018 - Detecting hidden batch factors through data-Adapti.pdf},
  issn = {1460-2059},
  journaltitle = {Bioinformatics},
  number = {7},
  pages = {1141--1147},
  title = {Detecting Hidden Batch Factors through Data-Adaptive Adjustment for Biological Effects},
  volume = {34}
}

@article{Yokoi2018,
  abstract = {Most of type 2 diabetes (T2D) is thought to be the result of interaction between genetic and environmental factors. However, the genetic components discovered to date can explain only a small proportion of the observed heritability. The ``missing heritability'' may be accounted for by rare variants, gene-environment interactions, and epigenetics.},
  author = {Yokoi, Norihide},
  date = {2018},
  doi = {10/ggcxm5},
  file = {/Users/ryan/Documents/Zotero Library/Yokoi - 2018 - Epigenetic dysregulation in pancreatic islets and .pdf},
  issn = {2040-1124},
  journaltitle = {Journal of Diabetes Investigation},
  number = {3},
  pages = {475--477},
  title = {Epigenetic Dysregulation in Pancreatic Islets and Pathogenesis of Type 2 Diabetes},
  volume = {9}
}

@article{Young2010,
  abstract = {We present GOseq, an application for performing Gene Ontology (GO) analysis on RNA-seq data. GO analysis is widely used to reduce complexity and highlight biological processes in genome-wide expression studies, but standard methods give biased results on RNA-seq data due to over-detection of differential expression for long and highly expressed transcripts. Application of GOseq to a prostate cancer data set shows that GOseq dramatically changes the results, highlighting categories more consistent with the known biology.},
  author = {Young, Matthew D and Wakefield, Matthew J and Smyth, Gordon K and Oshlack, Alicia},
  date = {2010-01},
  doi = {10/czz6tn},
  eprint = {20132535},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Young et al. - 2010 - Gene ontology analysis for RNA-seq accounting for.pdf},
  issn = {1465-6914},
  journaltitle = {Genome Biology},
  keywords = {Androgens,Androgens: pharmacology,Bias (Epidemiology),Cell Line; Tumor,Gene Expression Regulation; Neoplastic,Genome-Wide Association Study,Humans,Male,Prostatic Neoplasms,Prostatic Neoplasms: genetics,Sequence Analysis; RNA,Sequence Analysis; RNA: methods},
  number = {2},
  pages = {R14},
  title = {Gene Ontology Analysis for {{RNA}}-Seq: Accounting for Selection Bias},
  volume = {11}
}

@article{youngChIPseqAnalysisReveals2011,
  abstract = {Transcriptional control is dependent on a vast network of epigenetic modifications. One epigenetic mark of particular interest is tri-methylation of lysine 27 on histone H3 (H3K27me3), which is catalysed and maintained by Polycomb Repressive Complex 2 (PRC2). Although this histone mark is studied widely, the precise relationship between its local pattern of enrichment and regulation of gene expression is currently unclear. We have used ChIP-seq to generate genome-wide maps of H3K27me3 enrichment, and have identified three enrichment profiles with distinct regulatory consequences. First, a broad domain of H3K27me3 enrichment across the body of genes corresponds to the canonical view of H3K27me3 as inhibitory to transcription. Second, a peak of enrichment around the transcription start site (TSS) is commonly associated with 'bivalent' genes, where H3K4me3 also marks the TSS. Finally and most surprisingly, we identified an enrichment profile with a peak in the promoter of genes that is associated with active transcription. Genes with each of these three profiles were found in different proportions in each of the cell types studied. The data analysis techniques developed here will be useful for the identification of common enrichment profiles for other histone modifications that have important consequences for transcriptional regulation. \textcopyright{} 2011 The Author(s).},
  author = {Young, Matthew D. and Willson, Tracy A. and Wakefield, Matthew J. and Trounson, Evelyn and Hilton, Douglas J. and Blewitt, Marnie E. and Oshlack, Alicia and Majewski, Ian J.},
  date = {2011-09},
  doi = {10/cp6h62},
  eprint = {21652639},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Young et al. - 2011 - ChIP-seq analysis reveals distinct H3K27me3 profil.pdf},
  ids = {Young2011},
  issn = {1362-4962},
  journaltitle = {Nucleic Acids Research},
  number = {17},
  pages = {7415--7427},
  title = {{{ChIP}}-Seq Analysis Reveals Distinct {{H3K27me3}} Profiles That Correlate with Transcriptional Activity},
  volume = {39}
}

@article{Zachariah2018,
  abstract = {Objectives: Prognostic implications of early protocol biopsies have been studied; however, the value of late protocol biopsy in predicting graft outcome has not been well defined. Here, we compared the effects of early and late protocol biopsy histologic findings in stable kidney allografts and aimed to understand the significance of ``borderline'' rejection on allograft function. Materials and Methods: We studied 261 biopsies from 159 renal transplant recipients who were on a steroid-free, calcineurin inhibitor and mycophenolate mofetil regimen and who received transplants between 2004 and 2012 with mean follow-up of 5 years. Early (between 3 and 9 mo) and subsequent late (between 12 and 24 mo) protocol biopsies were performed. Biopsies were classified as normal, interstitial fibrosis and/or tubular atrophy, subclinical acute rejection with interstitial fibrosis and/or tubular atrophy, and borderline rejection with interstitial fibrosis and/or tubular atrophy. A linear mixed-effects model was used to determine the effects of early and late protocol biopsies on estimated glomerular filtration rate changes, with baseline time for estimated glomerular filtration rate fixed at 12 months. Results: The adjusted model showed that estimated glomerular filtration rate at 3 months, donor age, delayed graft function, and early protocol biopsies were associated with baseline estimated glomerular filtration rate at 12 months. Estimated glomerular filtration rate changes over time were associated with findings of interstitial fibrosis and/or tubular atrophy at early biopsy and subclinical acute rejection and borderline rejection at late biopsy. At last follow-up, final estimated glomerular filtration rate was significantly associated with interstitial fibrosis and/or tubular atrophy at early biopsy and with subclinical acute rejection at late biopsy.
Conclusions: Although early protocol biopsy predicted baseline estimated glomerular filtration rate, late biopsy was important for predicting changes in function over time. In addition, a diagnosis of ``borderline'' rejection on protocol biopsies predicted long-term graft function.},
  author = {Zachariah, Mareena S. and Dwivedi, Alok K. and Yip, Cindy S. and Chang, Shirley S. and Gundroo, Aijaz and Venuto, Rocco C. and Tomaszewski, John and Patel, Sunil K. and Sharma, Rajeev},
  date = {2018},
  doi = {10/ggcxm6},
  file = {/Users/ryan/Documents/Zotero Library/Zachariah et al. - 2018 - Utility of serial protocol biopsies performed afte.pdf},
  issn = {1304-0855},
  journaltitle = {Experimental and Clinical Transplantation},
  keywords = {Borderline rejection,Interstitial fibrosis and/or tubular atrophy,Subclinical acute rejection,Surveillance biopsies},
  number = {4},
  pages = {391--400},
  title = {Utility of Serial Protocol Biopsies Performed after 1 Year in Predicting Long-Term Kidney Allograft Function According to Histologic Phenotype},
  volume = {16}
}

@online{Zaharia2011,
  abstract = {We present the Scalable Nucleotide Alignment Program (SNAP), a new short and long read aligner that is both more accurate (i.e., aligns more reads with fewer errors) and 10-100x faster than state-of-the-art tools such as BWA. Unlike recent aligners based on the Burrows-Wheeler transform, SNAP uses a simple hash index of short seed sequences from the genome, similar to BLAST's. However, SNAP greatly reduces the number and cost of local alignment checks performed through several measures: it uses longer seeds to reduce the false positive locations considered, leverages larger memory capacities to speed index lookup, and excludes most candidate locations without fully computing their edit distance to the read. The result is an algorithm that scales well for reads from one hundred to thousands of bases long and provides a rich error model that can match classes of mutations (e.g., longer indels) that today's fast aligners ignore. We calculate that SNAP can align a dataset with 30x coverage of a human genome in less than an hour for a cost of \$2 on Amazon EC2, with higher accuracy than BWA. Finally, we describe ongoing work to further improve SNAP.},
  author = {Zaharia, Matei and Bolosky, William J. and Curtis, Kristal and Fox, Armando and Patterson, David and Shenker, Scott and Stoica, Ion and Karp, Richard M. and Sittler, Taylor},
  date = {2011-11-23},
  eprint = {1111.5572},
  eprintclass = {cs.DS},
  eprinttype = {arxiv},
  file = {/Users/ryan/Documents/Zotero Library/Zaharia et al. - 2011 - Faster and More Accurate Sequence Alignment with S.pdf;/Users/ryan/Zotero/storage/J69PC68R/1111.html},
  keywords = {Computer Science - Data Structures and Algorithms,Quantitative Biology - Genomics},
  title = {Faster and More Accurate Sequence Alignment with {{SNAP}}},
  url = {https://arxiv.org/abs/1111.5572},
  urldate = {2019-11-14}
}

@article{Zaher2009,
  abstract = {The overall fidelity of protein synthesis has been thought to rely on the combined accuracy of two basic processes: the aminoacylation of transfer RNAs with their cognate amino acid by the aminoacyl-tRNA synthetases, and the selection of cognate aminoacyl-tRNAs by the ribosome in cooperation with the GTPase elongation factor EF-Tu. These two processes, which together ensure the specific acceptance of a correctly charged cognate tRNA into the aminoacyl (A) site, operate before peptide bond formation. Here we report the identification of an additional mechanism that contributes to high fidelity protein synthesis after peptidyl transfer, using a well-defined in vitro bacterial translation system. In this retrospective quality control step, the incorporation of an amino acid from a non-cognate tRNA into the growing polypeptide chain leads to a general loss of specificity in the A site of the ribosome, and thus to a propagation of errors that results in abortive termination of protein synthesis.},
  author = {Zaher, Hani S and Green, Rachel},
  date = {2009-01-08},
  doi = {10/czsh35},
  eprint = {19092806},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Zaher and Green - 2009 - Quality control by the ribosome following peptide .pdf},
  issn = {1476-4687},
  journaltitle = {Nature},
  keywords = {Aminoacylation,Anticodon,Anticodon: genetics,Base Pair Mismatch,Base Pair Mismatch: genetics,Biocatalysis,Codon,Codon: genetics,Escherichia coli,Escherichia coli Proteins,Escherichia coli Proteins: metabolism,Escherichia coli: enzymology,Escherichia coli: genetics,Escherichia coli: metabolism,Peptide Chain Termination; Translational,Peptide Termination Factors,Peptide Termination Factors: metabolism,Protein Biosynthesis,Ribosomes,Ribosomes: chemistry,Ribosomes: genetics,Ribosomes: metabolism,RNA; Transfer,RNA; Transfer: genetics,RNA; Transfer: metabolism,Substrate Specificity},
  number = {7226},
  pages = {161--166},
  title = {Quality Control by the Ribosome Following Peptide Bond Formation},
  volume = {457}
}

@article{Zakeri2017,
  author = {Zakeri, Mohsen and Srivastava, Avi and Almodaresi, Fatemeh and Patro, Rob},
  date = {2017},
  doi = {10/gbnk89},
  file = {/Users/ryan/Documents/Zotero Library/Zakeri et al. - 2017 - Improved data-driven likelihood factorizations for.pdf},
  issn = {1367-4803},
  journaltitle = {Bioinformatics},
  number = {14},
  pages = {i142--i151},
  title = {Improved Data-Driven Likelihood Factorizations for Transcript Abundance Estimation},
  volume = {33}
}

@article{Zakeri2017a,
  author = {Zakeri, Mohsen and Srivastava, Avi and Almodaresi, Fatemeh and Patro, Rob},
  date = {2017-07-15},
  doi = {10/gbnk89},
  file = {/Users/ryan/Documents/Zotero Library/Zakeri et al. - 2017 - OUP accepted manuscript.pdf},
  issn = {1367-4803},
  journaltitle = {Bioinformatics},
  number = {14},
  pages = {i142--i151},
  title = {Improved Data-Driven Likelihood Factorizations for Transcript Abundance Estimation},
  volume = {33}
}

@article{Zang2009,
  abstract = {MOTIVATION: Chromatin states are the key to gene regulation and cell identity. Chromatin immunoprecipitation (ChIP) coupled with high-throughput sequencing (ChIP-Seq) is increasingly being used to map epigenetic states across genomes of diverse species. Chromatin modification profiles are frequently noisy and diffuse, spanning regions ranging from several nucleosomes to large domains of multiple genes. Much of the early work on the identification of ChIP-enriched regions for ChIP-Seq data has focused on identifying localized regions, such as transcription factor binding sites. Bioinformatic tools to identify diffuse domains of ChIP-enriched regions have been lacking. RESULTS: Based on the biological observation that histone modifications tend to cluster to form domains, we present a method that identifies spatial clusters of signals unlikely to appear by chance. This method pools together enrichment information from neighboring nucleosomes to increase sensitivity and specificity. By using genomic-scale analysis, as well as the examination of loci with validated epigenetic states, we demonstrate that this method outperforms existing methods in the identification of ChIP-enriched signals for histone modification profiles. We demonstrate the application of this unbiased method in important issues in ChIP-Seq data analysis, such as data normalization for quantitative comparison of levels of epigenetic modifications across cell types and growth conditions. AVAILABILITY: http://home.gwu.edu/\textasciitilde{}wpeng/Software.htm. SUPPLEMENTARY INFORMATION: Supplementary data are available at Bioinformatics online.},
  author = {Zang, Chongzhi and Schones, Dustin E. and Zeng, Chen and Cui, Kairong and Zhao, Keji and Peng, Weiqun},
  date = {2009},
  doi = {10/fd9qhm},
  eprint = {19505939},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Zang et al. - 2009 - A clustering approach for identification of enrich.pdf},
  issn = {1367-4803},
  journaltitle = {Bioinformatics},
  number = {15},
  pages = {1952--1958},
  title = {A Clustering Approach for Identification of Enriched Domains from Histone Modification {{ChIP}}-{{Seq}} Data},
  volume = {25}
}

@article{Zaykin2002,
  abstract = {We present a new procedure for combining P-values from a set of L hypothesis tests. Our procedure is to take the product of only those P-values less than some specified cut-off value and to evaluate the probability of such a product, or a smaller value, under the overall hypothesis that all L hypotheses are true. We give an explicit formulation for this P-value, and find by simulation that it can provide high power for detecting departures from the overall hypothesis. We extend the procedure to situations when tests are not independent. We present both real and simulated examples where the method is especially useful. These include exploratory analyses when L is large, such as genome-wide scans for marker-trait associations and meta-analytic applications that combine information from published studies, with potential for dealing with the ``publication bias'' phenomenon. Once the overall hypothesis is rejected, an adjustment procedure with strong family-wise error protection is available for smaller subsets of hypotheses, down to the individual tests.},
  author = {Zaykin, D. V. and Zhivotovsky, Lev A. and Westfall, P. H. and Weir, B. S.},
  date = {2002},
  doi = {10/cn39j9},
  eprint = {11788962},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Zaykin et al. - 2002 - Truncated product method for combining P-values.pdf},
  issn = {0741-0395},
  journaltitle = {Genetic Epidemiology},
  keywords = {Bonferroni,Genome-wide scans,Meta-analysis,Microarrays,Multiple tests},
  number = {2},
  pages = {170--185},
  title = {Truncated Product Method for Combining {{P}}-Values},
  volume = {22}
}

@article{Zaykin2011,
  abstract = {The inverse normal and Fisher's methods are two common approaches for combining P-values. Whitlock demonstrated that a weighted version of the inverse normal method, or 'weighted Z-test', is superior to Fisher's method for combining P-values for one-sided T-tests. The problem with Fisher's method is that it does not take advantage of weighting and loses power to the weighted Z-test when studies are differently sized. This issue was recently revisited by Chen, who observed that Lancaster's variation of Fisher's method had higher power than the weighted Z-test. Nevertheless, the weighted Z-test has comparable power to Lancaster's method when its weights are set to square roots of sample sizes. Power can be further improved when additional information is available. Although there is no single approach that is the best in every situation, the weighted Z-test enjoys certain properties that make it an appealing choice as a combination method for meta-analysis.},
  author = {Zaykin, Dmitri V.},
  date = {2011},
  doi = {10/d2s5kk},
  eprint = {21605215},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Zaykin - 2011 - Optimally weighted Z-test is a powerful method for.pdf},
  issn = {1010-061X},
  journaltitle = {Journal of Evolutionary Biology},
  keywords = {Combining P-values,Meta-analysis},
  number = {8},
  pages = {1836--1841},
  title = {Optimally Weighted {{Z}}-Test Is a Powerful Method for Combining Probabilities in Meta-Analysis},
  volume = {24}
}

@article{Zeileis2008,
  author = {Zeileis, Achim and Kleiber, Christian and Jackman, Simon},
  date = {2008},
  doi = {10.18637/jss.v027.i08},
  file = {/Users/ryan/Documents/Zotero Library/Zeileis et al. - 2008 - Regression Models for Count Data in R.pdf},
  issn = {1548-7660},
  journaltitle = {Journal of Statistical Software},
  keywords = {glm,hurdle model,negative binomial model,poisson model,zero-inflated model},
  number = {8},
  pages = {1--25},
  title = {Regression Models for Count Data in {{R}}},
  volume = {27}
}

@article{Zerbino2018,
  abstract = {The Ensembl project has been aggregating, processing, integrating and redistributing genomic datasets since the initial releases of the draft human genome, with the aim of accelerating genomics research through rapid open distribution of public data. Large amounts of raw data are thus transformed into knowledge, which is made available via a multitude of channels, in particular our browser (http://www.ensembl.org). Over time, we have expanded in multiple directions. First, our resources describe multiple fields of genomics, in particular gene annotation, comparative genomics, genetics and epigenomics. Second, we cover a growing number of genome assemblies; Ensembl Release 90 contains exactly 100. Third, our databases feed simultaneously into an array of services designed around different use cases, ranging from quick browsing to genome-wide bioinformatic analysis. We present here the latest developments of the Ensembl project, with a focus on managing an increasing number of assemblies, supporting efforts in genome interpretation and improving our browser.},
  author = {Zerbino, Daniel R. and Achuthan, Premanand and Akanni, Wasiu and Amode, M. Ridwan and Barrell, Daniel and Bhai, Jyothish and Billis, Konstantinos and Cummins, Carla and Gall, Astrid and Gir{\'o}n, Carlos Garc{\'i}a and Gil, Laurent and Gordon, Leo and Haggerty, Leanne and Haskell, Erin and Hourlier, Thibaut and Izuogu, Osagie G. and Janacek, Sophie H. and Juettemann, Thomas and To, Jimmy Kiang and Laird, Matthew R. and Lavidas, Ilias and Liu, Zhicheng and Loveland, Jane E. and Maurel, Thomas and McLaren, William and Moore, Benjamin and Mudge, Jonathan and Murphy, Daniel N. and Newman, Victoria and Nuhn, Michael and Ogeh, Denye and Ong, Chuang Kee and Parker, Anne and Patricio, Mateus and Riat, Harpreet Singh and Schuilenburg, Helen and Sheppard, Dan and Sparrow, Helen and Taylor, Kieron and Thormann, Anja and Vullo, Alessandro and Walts, Brandon and Zadissa, Amonida and Frankish, Adam and Hunt, Sarah E. and Kostadima, Myrto and Langridge, Nicholas and Martin, Fergal J. and Muffato, Matthieu and Perry, Emily and Ruffier, Magali and Staines, Dan M. and Trevanion, Stephen J. and Aken, Bronwen L. and Cunningham, Fiona and Yates, Andrew and Flicek, Paul},
  date = {2018-01-04},
  doi = {10/gcwg6r},
  file = {/Users/ryan/Documents/Zotero Library/Zerbino et al. - 2018 - Ensembl 2018.pdf},
  issn = {1362-4962},
  journaltitle = {Nucleic Acids Research},
  number = {D1},
  pages = {D754--D761},
  title = {Ensembl 2018},
  volume = {46}
}

@article{Zhang2000,
  abstract = {For aligning DNA sequences that differ only by sequencing errors, or by equivalent errors from other sources, a greedy algorithm can be much faster than traditional dynamic programming approaches and yet produce an alignment that is guaranteed to be theoretically optimal. We introduce a new greedy alignment algorithm with particularly good performance and show that it computes the same alignment as does a certain dynamic programming algorithm, while executing over 10 times faster on appropriate data. An implementation of this algorithm is currently used in a program that assembles the UniGene database at the National Center for Biotechnology Information.},
  author = {Zhang, Zheng and Schwartz, Scott and Wagner, Lukas and Miller, Webb},
  date = {2000-02},
  doi = {10/b6z8bg},
  eprint = {10890397},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Zhang et al. - 2000 - A greedy algorithm for aligning DNA sequences.pdf},
  issn = {1066-5277},
  journaltitle = {Journal of Computational Biology},
  keywords = {Dynamic programming,Greedy algorithms,Sequence alignment},
  number = {1-2},
  pages = {203--214},
  title = {A Greedy Algorithm for Aligning {{DNA}} Sequences},
  volume = {7}
}

@article{Zhang2008,
  abstract = {We present Model-based Analysis of ChIP-Seq data, MACS, which analyzes data generated by short read sequencers such as Solexa's Genome Analyzer. MACS empirically models the shift size of ChIP-Seq tags, and uses it to improve the spatial resolution of predicted binding sites. MACS also uses a dynamic Poisson distribution to effectively capture local biases in the genome, allowing for more robust predictions. MACS compares favorably to existing ChIP-Seq peak-finding algorithms, and is freely available.},
  author = {Zhang, Yong and Liu, Tao and Meyer, Clifford A. and Eeckhoute, J{\'e}r{\^o}me and Johnson, David S and Bernstein, Bradley E and Nussbaum, Chad and Myers, Richard M and Brown, Myles and Li, Wei and Liu, X Shirley},
  date = {2008-01},
  doi = {10/dfst4f},
  eprint = {18798982},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Zhang et al. - 2008 - Model-based Analysis of ChIP-Seq (MACS).pdf},
  issn = {1465-6906},
  journaltitle = {Genome Biology},
  keywords = {Algorithms,Cell Line; Tumor,Chromatin Immunoprecipitation,Chromatin Immunoprecipitation: methods,Hepatocyte Nuclear Factor 3-alpha,Hepatocyte Nuclear Factor 3-alpha: analysis,Hepatocyte Nuclear Factor 3-alpha: genetics,Humans,Models; Genetic,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods},
  number = {9},
  pages = {R137},
  title = {Model-Based {{Analysis}} of {{ChIP}}-{{Seq}} ({{MACS}})},
  volume = {9}
}

@article{Zhang2014,
  abstract = {To characterize the role of the circadian clock in mouse physiology and behavior, we used RNA-seq and DNA arrays to quantify the transcriptomes of 12 mouse organs over time. We found 43\% of all protein coding genes showed circadian rhythms in transcription somewhere in the body, largely in an organ-specific manner. In most organs, we noticed the expression of many oscillating genes peaked during transcriptional "rush hours" preceding dawn and dusk. Looking at the genomic landscape of rhythmic genes, we saw that they clustered together, were longer, and had more spliceforms than nonoscillating genes. Systems-level analysis revealed intricate rhythmic orchestration of gene pathways throughout the body. We also found oscillations in the expression of more than 1,000 known and novel noncoding RNAs (ncRNAs). Supporting their potential role in mediating clock function, ncRNAs conserved between mouse and human showed rhythmic expression in similar proportions as protein coding genes. Importantly, we also found that the majority of best-selling drugs and World Health Organization essential medicines directly target the products of rhythmic genes. Many of these drugs have short half-lives and may benefit from timed dosage. In sum, this study highlights critical, systemic, and surprising roles of the mammalian circadian clock and provides a blueprint for advancement in chronotherapy.},
  author = {Zhang, Ray and Lahens, Nicholas F and Ballance, Heather I and Hughes, Michael E and Hogenesch, John B},
  date = {2014},
  doi = {10/634},
  eprint = {25349387},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Zhang et al. - 2014 - A circadian gene expression atlas in mammals impl.pdf},
  issn = {0027-8424, 1091-6490},
  journaltitle = {Proceedings of the National Academy of Sciences of the United States of America},
  keywords = {Animals,Chronotherapy,Chronotherapy: methods,Circadian Rhythm,Circadian Rhythm: physiology,Databases; Nucleic Acid,Gene Expression Profiling,Gene Expression Profiling: methods,Gene Expression Regulation,Gene Expression Regulation: physiology,Humans,Mice,Transcriptome,Transcriptome: physiology},
  number = {45},
  pages = {16219--16224},
  title = {A Circadian Gene Expression Atlas in Mammals: Implications for Biology and Medicine},
  volume = {111}
}

@article{Zhao2015,
  author = {Zhao, Song and Niu, Feng and Xu, Chang Yan and Ye, Long and Bi, Gui Bin and Chen, Lin and Gong, Ping and Tian, Gang and Nie, Tian Hong},
  date = {2015},
  doi = {10/ggcxm8},
  file = {/Users/ryan/Documents/Zotero Library/Zhao et al. - 2015 - Microarray and ChIP-seq data analysis revealed cha.pdf},
  issn = {1791-3004},
  journaltitle = {Molecular Medicine Reports},
  keywords = {Chromatin immunoprecipitation-sequencing data,Differentially expressed genes,Functional enrichment analysis,Osteosarcoma,P53,Transcription factor},
  number = {3},
  pages = {4284--4290},
  title = {Microarray and {{ChIP}}-Seq Data Analysis Revealed Changes in {{p53}}-Mediated Transcriptional Regulation in {{Nutlin}}-3-Treated {{U2OS}} Cells},
  volume = {12}
}

@article{Zhou2009a,
  abstract = {Advances in sequencing technologies have accelerated the sequencing of new genomes, far outpacing the generation of gene and protein resources needed to annotate them. Direct comparison and alignment of existing cDNA sequences from a related species is an effective and readily available means to determine genes in the new genomes. Current spliced alignment programs are inadequate for comparing sequences between different species, owing to their low sensitivity and splice junction accuracy. A new spliced alignment tool, sim4cc, overcomes problems in the earlier tools by incorporating three new features: universal spaced seeds, to increase sensitivity and allow comparisons between species at various evolutionary distances, and powerful splice signal models and evolutionarily-aware alignment techniques, to improve the accuracy of gene models. When tested on vertebrate comparisons at diverse evolutionary distances, sim4cc had significantly higher sensitivity compared to existing alignment programs, more than 10\% higher than the closest competitor for some comparisons, while being comparable in speed to its predecessor, sim4. Sim4cc can be used in one-to-one or one-to-many comparisons of genomic and cDNA sequences, and can also be effectively incorporated into a high-throughput annotation engine, as demonstrated by the mapping of 64,000 Fagus grandifolia 454 ESTs and unigenes to the poplar genome.},
  author = {Zhou, Leming and Pertea, Mihaela and Delcher, Arthur L and Florea, Liliana},
  date = {2009-06},
  doi = {10/c6dnmp},
  eprint = {19429899},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Zhou et al. - 2009 - Sim4cc a cross-species spliced alignment program..pdf},
  issn = {1362-4962},
  journaltitle = {Nucleic Acids Research},
  keywords = {Algorithms,Animals,Dogs,Genome; Plant,Genomics,Genomics: methods,Humans,Mice,Reference Standards,RNA Splicing,Sequence Alignment,Sequence Alignment: methods,Sequence Alignment: standards,Software,Vertebrates,Vertebrates: genetics},
  number = {11},
  pages = {e80},
  title = {Sim4cc: A Cross-Species Spliced Alignment Program},
  volume = {37}
}

@article{Zhou2011,
  abstract = {A number of penalization and shrinkage approaches have been proposed for the analysis of microarray gene expression data. Similar techniques are now routinely applied to RNA sequence transcriptional count data, although the value of such shrinkage has not been conclusively established. If penalization is desired, the explicit modeling of mean-variance relationships provides a flexible testing regimen that 'borrows' information across genes, while easily incorporating design effects and additional covariates.},
  author = {Zhou, Yi-Hui and Xia, Kai and Wright, Fred A},
  date = {2011-10-01},
  doi = {10/cscxcv},
  eprint = {21810900},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Zhou et al. - 2011 - A powerful and flexible approach to the analysis o.pdf},
  issn = {1367-4811},
  journaltitle = {Bioinformatics},
  keywords = {Base Sequence,Base Sequence: genetics,Gene Expression,Gene Expression Profiling,Gene Expression Profiling: methods,Models; Genetic,Models; Statistical,Sequence Analysis; RNA,Sequence Analysis; RNA: methods,Software,Transcription; Genetic,Transcriptome,Transcriptome: genetics},
  number = {19},
  pages = {2672--2678},
  title = {A Powerful and Flexible Approach to the Analysis of {{RNA}} Sequence Count Data},
  volume = {27}
}

@article{zhouRobustlyDetectingDifferential2014,
  abstract = {A popular approach for comparing gene expression levels between (replicated) conditions of RNA sequencing data relies on counting reads that map to f},
  author = {Zhou, Xiaobei and Lindsay, Helen and Robinson, Mark D.},
  date = {2014-06-17},
  doi = {10/f584s7},
  file = {/Users/ryan/Documents/Zotero Library/Zhou et al. - 2014 - Robustly detecting differential expression in RNA .pdf;/Users/ryan/Zotero/storage/DS5X6QM8/1427925.html},
  issn = {0305-1048},
  journaltitle = {Nucleic Acids Research},
  langid = {english},
  number = {11},
  pages = {e91},
  shortjournal = {Nucleic Acids Res},
  title = {Robustly Detecting Differential Expression in {{RNA}} Sequencing Data Using Observation Weights},
  volume = {42}
}

@article{Zhu2007,
  abstract = {Taking advantage of the complete genome sequences of several mammals, we developed a novel method to detect losses of well-established genes in the human genome through syntenic mapping of gene structures between the human, mouse, and dog genomes. Unlike most previous genomic methods for pseudogene identification, this analysis is able to differentiate losses of well-established genes from pseudogenes formed shortly after segmental duplication or generated via retrotransposition. Therefore, it enables us to find genes that were inactivated long after their birth, which were likely to have evolved nonredundant biological functions before being inactivated. The method was used to look for gene losses along the human lineage during the approximately 75 million years (My) since the common ancestor of primates and rodents (the euarchontoglire crown group). We identified 26 losses of well-established genes in the human genome that were all lost at least 50 My after their birth. Many of them were previously characterized pseudogenes in the human genome, such as GULO and UOX. Our methodology is highly effective at identifying losses of single-copy genes of ancient origin, allowing us to find a few well-known pseudogenes in the human genome missed by previous high-throughput genome-wide studies. In addition to confirming previously known gene losses, we identified 16 previously uncharacterized human pseudogenes that are definitive losses of long-established genes. Among them is ACYL3, an ancient enzyme present in archaea, bacteria, and eukaryotes, but lost approximately 6 to 8 Mya in the ancestor of humans and chimps. Although losses of well-established genes do not equate to adaptive gene losses, they are a useful proxy to use when searching for such genetic changes. This is especially true for adaptive losses that occurred more than 250,000 years ago, since any genetic evidence of the selective sweep indicative of such an event has been erased.},
  author = {Zhu, Jingchun and Sanborn, J Zachary and Diekhans, Mark and Lowe, Craig B and Pringle, Tom H and Haussler, David},
  date = {2007-12},
  doi = {10/bt69c4},
  eprint = {18085818},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Zhu et al. - 2007 - Comparative genomics search for losses of long-est.pdf},
  issn = {1553-7358},
  journaltitle = {PLoS Computational Biology},
  keywords = {Animals,Biological Evolution,Chromosome Mapping,Chromosome Mapping: methods,DNA Mutational Analysis,DNA Mutational Analysis: methods,Dogs,Evolution; Molecular,Gene Deletion,Genetic Variation,Genetic Variation: genetics,Genome; Human,Genome; Human: genetics,Genomics,Genomics: methods,Humans,Mice,Pseudogenes,Pseudogenes: genetics},
  number = {12},
  pages = {e247},
  title = {Comparative Genomics Search for Losses of Long-Established Genes on the Human Lineage},
  volume = {3}
}

@article{Zhu2010,
  abstract = {Chromatin immunoprecipitation (ChIP) followed by high-throughput sequencing (ChIP-seq) or ChIP followed by genome tiling array analysis (ChIP-chip) have become standard technologies for genome-wide identification of DNA-binding protein target sites. A number of algorithms have been developed in parallel that allow identification of binding sites from ChIP-seq or ChIP-chip datasets and subsequent visualization in the University of California Santa Cruz (UCSC) Genome Browser as custom annotation tracks. However, summarizing these tracks can be a daunting task, particularly if there are a large number of binding sites or the binding sites are distributed widely across the genome.},
  author = {Zhu, Lihua Julie and Gazin, Claude and Lawson, Nathan D and Pag{\`e}s, Herv{\'e} and Lin, Simon M and Lapointe, David S and Green, Michael R},
  date = {2010-01},
  doi = {10/d5bzrs},
  eprint = {20459804},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Zhu et al. - 2010 - ChIPpeakAnno a Bioconductor package to annotate C.pdf},
  issn = {1471-2105},
  journaltitle = {BMC Bioinformatics},
  keywords = {Binding Sites,Chromatin Immunoprecipitation,Chromatin Immunoprecipitation: methods,Genome,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods,Software},
  pages = {237},
  title = {{{ChIPpeakAnno}}: A {{Bioconductor}} Package to Annotate {{ChIP}}-Seq and {{ChIP}}-Chip Data},
  volume = {11}
}

@unpublished{Zhu2011,
  author = {Zhu, Lihua Julie},
  date = {2011-07-29},
  file = {/Users/ryan/Documents/Zotero Library/Zhu - 2011 - BioC2011  ChIPpeakAnno Practical.pdf},
  keywords = {annotation,chip-seq,presentation,R,tutorial},
  title = {{{BioC2011}}: {{ChIPpeakAnno}} Practical}
}

@article{Zhu2018,
  abstract = {Motivation: In RNA-seq differential expression analysis, investigators aim to detect those genes with changes in expression level across conditions, despite technical and biological variability in the observations. A common task is to accurately estimate the effect size, often in terms of a logarithmic fold change (LFC). Results: When the read counts are low or highly variable, the maximum likelihood estimates for the LFCs has high variance, leading to large estimates not representative of true differences, and poor ranking of genes by effect size. One approach is to introduce filtering thresholds and pseudocounts to exclude or moderate estimated LFCs. Filtering may result in a loss of genes from the analysis with true differences in expression, while pseudocounts provide a limited solution that must be adapted per dataset. Here, we propose the use of a heavy-tailed Cauchy prior distribution for effect sizes, which avoids the use of filter thresholds or pseudocounts. The proposed method, Approximate Posterior Estimation for generalized linear model, apeglm, has lower bias than previously proposed shrinkage estimators, while still reducing variance for those genes with little information for statistical inference.},
  author = {Zhu, Anqi and Ibrahim, Joseph G. and Love, Michael I.},
  date = {2019},
  doi = {10/ggcxm9},
  file = {/Users/ryan/Documents/Zotero Library/Zhu et al. - 2019 - Heavy-Tailed prior distributions for sequence coun.pdf},
  issn = {1460-2059},
  journaltitle = {Bioinformatics},
  keywords = {effect size,empirical bayes,hierarchical model,log fold change,rna sequencing,rna-seq,shrinkage estimation,statistical method},
  number = {12},
  pages = {2084--2092},
  title = {Heavy-Tailed Prior Distributions for Sequence Count Data: {{Removing}} the Noise and Preserving Large Differences},
  volume = {35}
}

@article{Zisoulis2010,
  abstract = {MicroRNAs (miRNAs) regulate gene expression by guiding Argonaute proteins to specific target mRNA sequences. Identification of bona fide miRNA target sites in animals is challenging because of uncertainties regarding the base-pairing requirements between miRNA and target as well as the location of functional binding sites within mRNAs. Here we present the results of a comprehensive strategy aimed at isolating endogenous mRNA target sequences bound by the Argonaute protein ALG-1 in C. elegans. Using cross-linking and ALG-1 immunoprecipitation coupled with high-throughput sequencing (CLIP-seq), we identified extensive ALG-1 interactions with specific 3' untranslated region (UTR) and coding exon sequences and discovered features that distinguish miRNA complex binding sites in 3' UTRs from those in other genic regions. Furthermore, our analyses revealed a striking enrichment of Argonaute binding sites in genes important for miRNA function, suggesting an autoregulatory role that may confer robustness to the miRNA pathway.},
  author = {Zisoulis, Dimitrios G and Lovci, Michael T and Wilbert, Melissa L and Hutt, Kasey R and Liang, Tiffany Y and Pasquinelli, Amy E and Yeo, Gene W},
  date = {2010-02},
  doi = {10/d5bp77},
  eprint = {20062054},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Zisoulis et al. - 2010 - Comprehensive discovery of endogenous Argonaute bi.pdf},
  issn = {1545-9985},
  journaltitle = {Nature Structural \& Molecular Biology},
  keywords = {Animals,Base Sequence,Binding Sites,Caenorhabditis elegans,Caenorhabditis elegans Proteins,Caenorhabditis elegans Proteins: metabolism,Caenorhabditis elegans: physiology,Chromatin Immunoprecipitation,Eukaryotic Initiation Factors,Eukaryotic Initiation Factors: metabolism,MicroRNAs,MicroRNAs: metabolism,Molecular Sequence Data,RNA; Helminth,RNA; Helminth: metabolism,RNA; Messenger,RNA; Messenger: metabolism,Sequence Analysis; DNA},
  number = {2},
  pages = {173--179},
  title = {Comprehensive Discovery of Endogenous {{Argonaute}} Binding Sites in {{Caenorhabditis}} {{elegans}}},
  volume = {17}
}

@article{Zitnik,
  abstract = {New technologies have enabled the investigation of biology and human health at an unprecedented scale and in multiple dimensions. These dimensions include a myriad of properties describing genome, epigenome, transcriptome, microbiome, phenotype, and lifestyle. No single data type, however, can capture the complexity of all the factors relevant to understanding a phenomenon such as a disease. Integrative methods that combine data from multiple technologies have thus emerged as critical statistical and computational approaches. The key challenge in developing such approaches is the identification of effective models to provide a comprehensive and relevant systems view. An ideal method can answer a biological or medical question, identifying important features and predicting outcomes, by harnessing heterogeneous data across several dimensions of biological variation. In this Review, we describe the principles of data integration and discuss current methods and available implementations. We provide examples of successful data integration in biology and medicine. Finally, we discuss current challenges in biomedical integrative methods and our perspective on the future development of the field.},
  author = {Zitnik, Marinka and Nguyen, Francis and Wang, Bo and Leskovec, Jure and Goldenberg, Anna and Hoffman, Michael M.},
  date = {2019},
  doi = {10/gf7rj8},
  eprint = {1807.00123},
  eprinttype = {arxiv},
  file = {/Users/ryan/Documents/Zotero Library/Zitnik et al. - 2019 - Machine learning for integrating data in biology a.pdf},
  issn = {1566-2535},
  journaltitle = {Information Fusion},
  keywords = {Computational biology,Heterogeneous data,Machine learning,Personalized medicine,Systems biology},
  pages = {71--91},
  title = {Machine Learning for Integrating Data in Biology and Medicine: {{Principles}}, Practice, and Opportunities},
  volume = {50}
}

@article{Zou2014,
  author = {Zou, James and Lippert, Christoph and Heckerman, David and Aryee, Martin and Listgarten, Jennifer},
  date = {2014},
  doi = {10/gdsmqs},
  file = {/Users/ryan/Documents/Zotero Library/Zou et al. - 2014 - Epigenome-wide association studies without the nee.pdf},
  issn = {1548-7091},
  journaltitle = {Nature Methods},
  number = {3},
  pages = {309--311},
  title = {Epigenome-Wide Association Studies without the Need for Cell-Type Composition},
  volume = {11}
}

@article{Zwiener2014,
  abstract = {Gene expression measurements have successfully been used for building prognostic signatures, i.e for identifying a short list of important genes that can predict patient outcome. Mostly microarray measurements have been considered, and there is little advice available for building multivariable risk prediction models from RNA-Seq data. We specifically consider penalized regression techniques, such as the lasso and componentwise boosting, which can simultaneously consider all measurements and provide both, multivariable regression models for prediction and automated variable selection. However, they might be affected by the typical skewness, mean-variance-dependency or extreme values of RNA-Seq covariates and therefore could benefit from transformations of the latter. In an analytical part, we highlight preferential selection of covariates with large variances, which is problematic due to the mean-variance dependency of RNA-Seq data. In a simulation study, we compare different transformations of RNA-Seq data for potentially improving detection of important genes. Specifically, we consider standardization, the log transformation, a variance-stabilizing transformation, the Box-Cox transformation, and rank-based transformations. In addition, the prediction performance for real data from patients with kidney cancer and acute myeloid leukemia is considered. We show that signature size, identification performance, and prediction performance critically depend on the choice of a suitable transformation. Rank-based transformations perform well in all scenarios and can even outperform complex variance-stabilizing approaches. Generally, the results illustrate that the distribution and potential transformations of RNA-Seq data need to be considered as a critical step when building risk prediction models by penalized regression techniques.},
  author = {Zwiener, Isabella and Frisch, Barbara and Binder, Harald},
  date = {2014-01},
  doi = {10/gd85j7},
  eprint = {24416353},
  eprinttype = {pmid},
  file = {/Users/ryan/Documents/Zotero Library/Zwiener et al. - 2014 - Transforming RNA-Seq Data to Improve the Performan.pdf},
  issn = {1932-6203},
  journaltitle = {PLoS ONE},
  number = {1},
  pages = {e85150},
  title = {Transforming {{RNA}}-{{Seq}} Data to Improve the Performance of Prognostic Gene Signatures},
  volume = {9}
}

% Raw LaTeX emitted ahead of the bibliography. It is guarded by \ifdefined:
% when \DeclarePrefChars exists it is called with the three characters
% apostrophe, right single quote and hyphen; otherwise nothing is executed.
% NOTE(review): presumably this targets biblatex's name-prefix character
% list -- confirm against the biblatex version in use.
@preamble{ "\ifdefined\DeclarePrefChars\DeclarePrefChars{'’-}\else\fi " }