sqrt(10)
[1] 3.162278
The article provides a significant medical insight into Systemic Lupus Erythematosus (SLE) by identifying two distinct gene expression patterns in the immune cells of SLE patients, including disease-state and disease-activity signature. Furthermore, the disease-activity signature is clinically relevant to personalized treatment.
The primary genomics technology used in this study was bulk RNA sequencing (RNA-seq). The study also integrated data from other genomics technologies, including genome-wide association studies (GWAS), expression quantitative trait loci (eQTL) and whole-genome sequencing (WGS).
sqrt(10)
[1] 3.162278
log2(32)
[1] 5
sum(1:1000)
[1] 500500
sum(seq(2, 1000, by = 2))
[1] 250500
choose(100, 2)
[1] 4950
choose(100, 3)
[1] 161700
data(CO2)
help(CO2)
CO2 uptake of six plants in different CO2 concentration from Quebec and Mississippi.
summary(CO2$uptake)
Min. 1st Qu. Median Mean 3rd Qu. Max.
7.70 17.90 28.30 27.21 37.12 45.50
The average of the whole dataset is 27.21 and median is 28.30.
suppressMessages({library(tidyverse)})
%>% filter(Type == "Quebec") %>% pull(uptake) %>% summary() CO2
Min. 1st Qu. Median Mean 3rd Qu. Max.
9.30 30.32 37.15 33.54 40.15 45.50
%>% filter(Type == "Mississippi") %>% pull(uptake) %>% summary() CO2
Min. 1st Qu. Median Mean 3rd Qu. Max.
7.70 13.88 19.30 20.88 28.05 35.50
The average for Quebec is 33.54 and median is 37.15. The average for Mississippi is 20.88 and median is 19.30.
<- function(v) {
func return(
as.numeric(summary(v)["Mean"] / summary(v)["Median"]) %>%
`names<-`("Ratio of the mean and the median")
)
}
func(CO2$uptake)
Ratio of the mean and the median
0.9615935
<- function(v) {
fuc <- sort(v)
v <- v[-c(1,length(v))]
v return(mean(v))
}fuc(c(2, 1, 100, 3))
[1] 2.5
The pipe %>% in R improves readability by chaining operations left to right, reducing nested function calls. Use it when transforming data step by step. Avoid it for very simple code, deeply nested functions, or debugging, as pipes can obscure intermediate results and error sources.
The apply-family functions make it possible to apply a function over elements of vectors, lists, or rows/columns of matrices without explicit loops, making code cleaner, faster, and more efficient for data analysis, paticularly when we have more than one samples, count matrix, data frames, etc.
suppressMessages({library(nummenmaa)})
<- read_csv("magic_guys.csv", show_col_types = FALSE)
magic_guys <- magic_guys %>% filter(species == "jedi")
jedi <- magic_guys %>% filter(species == "sith")
sith
hist(jedi$length,
main = "Height Distribution of Jedi",
xlab = "Height",
col = "lightblue")
hist(sith$length,
main = "Height Distribution of ",
xlab = "Height",
col = "lightblue")
hist(jedi$length,
main = "Height Distribution of Jedi",
breaks = seq(min(jedi$length), max(jedi$length), length.out = 10),
xlab = "Height",
col = "lightblue")
hist(sith$length,
main = "Height Distribution of Sith",
breaks = seq(min(sith$length), max(sith$length), length.out = 10),
xlab = "Height",
col = "lightblue")
<- ggplot(magic_guys, aes(x = species, y = length, fill = species)) +
p1 geom_boxplot(alpha = 0.7, outlier.shape = 16, outlier.size = 2) +
labs(title = "Height Distribution by Species",
x = "Species",
y = "Height") +
theme_test() +
theme(legend.position = "none") +
scale_fill_manual(values = c("lightblue", "lightcoral"))
<- ggplot(magic_guys, aes(x = species, y = length, fill = species)) +
p2 geom_boxplot(alpha = 0.7, outlier.shape = NA) +
geom_jitter(width = 0.2, alpha = 0.5, size = 1.5) +
labs(title = "Height Distribution with Individual Data Points",
x = "Species",
y = "Height") +
theme_test() +
theme(legend.position = "none") +
scale_fill_manual(values = c("lightblue", "lightcoral"))
print(p1) ; print(p2)
# pdf("xxxxxxxxx", height = 5, width = 4) # save as pdf
# dev.off()
# ggsave("xxxxxxxxx", height = 5, width = 4) # save as pdf/svg/png
PDFs are best for high-quality vector graphics, PNG for web use or raster images, and SVG for scalable graphics in interactive or web contexts.
suppressMessages({library(patchwork)})
<- ggplot(CO2, aes(x = uptake)) +
p1 geom_histogram(bins = 20, fill = "lightblue", alpha = 0.7, color = "black") +
labs(title = "Distribution of CO2 Uptake Rates",
x = "CO2 Uptake Rate (μmol/m²·s)",
y = "Frequency") +
theme_minimal()
<- ggplot(CO2, aes(x = uptake, fill = Type)) +
p2 geom_histogram(bins = 15, alpha = 0.7, position = "identity") +
facet_grid(Treatment ~ Type) +
labs(title = "CO2 Uptake Distribution by Origin and Treatment",
x = "CO2 Uptake Rate (μmol/m²·s)",
y = "Frequency") +
theme_minimal() +
theme(legend.position = "bottom")
<- ggplot(CO2, aes(x = conc, y = uptake)) +
p3 geom_point(alpha = 0.6, size = 2) +
geom_smooth(method = "loess", se = TRUE, color = "red", linewidth = 1.2) +
labs(title = "CO2 Uptake vs Concentration (Overall Pattern)",
x = "CO2 Concentration (mL/L)",
y = "CO2 Uptake Rate (μmol/m²·s)") +
theme_minimal()
<- ggplot(CO2, aes(x = conc, y = uptake, color = Type)) +
p4 geom_point(alpha = 0.7, size = 2) +
geom_smooth(method = "loess", se = FALSE, linewidth = 1.2) +
facet_wrap(~Treatment) +
labs(title = "CO2 Uptake vs Concentration by Origin and Treatment",
x = "CO2 Concentration (mL/L)",
y = "CO2 Uptake Rate (μmol/m²·s)") +
theme_minimal() +
scale_color_manual(values = c("Quebec" = "blue", "Mississippi" = "red")) +
theme(legend.position = "bottom")
<- ggplot(CO2, aes(x = conc, y = uptake, color = Type)) +
p5 geom_line(aes(group = Plant), alpha = 0.7, linewidth = 0.8) +
geom_point(alpha = 0.5, size = 1.5) +
facet_grid(Treatment ~ Type) +
labs(title = "Individual Plant CO2 Response Curves",
x = "CO2 Concentration (mL/L)",
y = "CO2 Uptake Rate (μmol/m²·s)",
subtitle = "Each line represents one plant") +
theme_minimal() +
theme(legend.position = "none")
<- ggplot(CO2, aes(x = Treatment, y = uptake, fill = Treatment)) +
p6 geom_boxplot(alpha = 0.7) +
geom_jitter(width = 0.2, alpha = 0.5, size = 1) +
facet_wrap(~Type) +
labs(title = "CO2 Uptake Comparison: Chilled vs Non-chilled",
x = "Treatment",
y = "CO2 Uptake Rate (μmol/m²·s)") +
theme_minimal() +
theme(legend.position = "none") +
scale_fill_manual(values = c("nonchilled" = "orange", "chilled" = "lightblue"))
<- ggplot(CO2, aes(x = Type, y = uptake, fill = Type)) +
p7 geom_boxplot(alpha = 0.7) +
geom_jitter(width = 0.2, alpha = 0.5, size = 1) +
facet_wrap(~Treatment) +
labs(title = "CO2 Uptake Comparison: Quebec vs Mississippi Origins",
x = "Plant Origin",
y = "CO2 Uptake Rate (μmol/m²·s)") +
theme_minimal() +
theme(legend.position = "none") +
scale_fill_manual(values = c("Quebec" = "darkblue", "Mississippi" = "darkred"))
<- ggplot(CO2, aes(x = factor(conc), y = uptake, fill = interaction(Type, Treatment))) +
p8 geom_boxplot(alpha = 0.7) +
labs(title = "CO2 Uptake at Different Concentration Levels",
x = "CO2 Concentration (mL/L)",
y = "CO2 Uptake Rate (μmol/m²·s)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.title = element_blank()) +
scale_fill_manual(values = c("Quebec.nonchilled" = "lightblue",
"Quebec.chilled" = "blue",
"Mississippi.nonchilled" = "lightcoral",
"Mississippi.chilled" = "red"))
<- CO2 %>%
interaction_data group_by(Type, Treatment, conc) %>%
summarise(mean_uptake = mean(uptake),
se_uptake = sd(uptake) / sqrt(n()),
.groups = 'drop')
<- ggplot(interaction_data, aes(x = conc, y = mean_uptake, color = Type, linetype = Treatment)) +
p9 geom_line(linewidth = 1.2) +
geom_point(size = 3) +
geom_errorbar(aes(ymin = mean_uptake - se_uptake, ymax = mean_uptake + se_uptake),
width = 20, alpha = 0.7) +
labs(title = "Mean CO2 Uptake Response with Standard Errors",
x = "CO2 Concentration (mL/L)",
y = "Mean CO2 Uptake Rate (μmol/m²·s)",
subtitle = "Error bars represent standard error of the mean") +
theme_minimal() +
scale_color_manual(values = c("Quebec" = "blue", "Mississippi" = "red")) +
theme(legend.position = "bottom")
wrap_plots(p1, p2, p3, p4, p5, p6, p7, p8, p9, nrow = 2)
library(tidybiology)
data("chromosome")
data("proteins")
# Extract summary statistics using tidyverse functions
<- chromosome %>%
chromosome_summary summarise(
# Variations statistics
variations_mean = mean(variations, na.rm = TRUE),
variations_median = median(variations, na.rm = TRUE),
variations_max = max(variations, na.rm = TRUE),
# Protein coding genes statistics
protein_codinggenes_mean = mean(protein_codinggenes, na.rm = TRUE),
protein_codinggenes_median = median(protein_codinggenes, na.rm = TRUE),
protein_codinggenes_max = max(protein_codinggenes, na.rm = TRUE),
# miRNAs statistics
miRNAs_mean = mean(mi_rna, na.rm = TRUE),
miRNAs_median = median(mi_rna, na.rm = TRUE),
miRNAs_max = max(mi_rna, na.rm = TRUE)
)
# Display results in a more readable format
<- chromosome_summary %>%
summary_long pivot_longer(
cols = everything(),
names_to = c("variable", "statistic"),
names_sep = "_(?=[^_]+$)", # Split on the last underscore
values_to = "value"
%>%
) pivot_wider(
names_from = statistic,
values_from = value
%>%
) mutate(
variable = case_when(
== "variations" ~ "Variations",
variable == "protein_codinggenes" ~ "Protein Coding Genes",
variable == "miRNAs" ~ "miRNAs"
variable
),across(c(mean, median, max), ~round(.x, 2))
)
print(summary_long)
# A tibble: 3 × 4
variable mean median max
<chr> <dbl> <dbl> <dbl>
1 Variations 6484572. 6172346 12945965
2 Protein Coding Genes 850. 836 2058
3 miRNAs 73.2 75 134
<- ggplot(chromosome, aes(x = length_mm)) +
p1 geom_histogram(bins = 15, fill = "steelblue", alpha = 0.7, color = "black") +
geom_vline(aes(xintercept = mean(length_mm)),
color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = median(length_mm)),
color = "orange", linetype = "dashed", size = 1) +
labs(title = "Distribution of Human Chromosome Sizes",
x = "Chromosome Length (mm)",
y = "Frequency",
subtitle = "Red line: Mean, Orange line: Median") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
plot.subtitle = element_text(hjust = 0.5, size = 10))
<- ggplot(chromosome, aes(y = length_mm)) +
p2 geom_boxplot(fill = "lightcoral", alpha = 0.7, width = 0.5) +
geom_jitter(aes(x = 0), width = 0.1, alpha = 0.6, size = 2, color = "darkred") +
labs(title = "Chromosome Size Distribution (Boxplot)",
y = "Chromosome Length (mm)") +
theme_minimal() +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))
<- chromosome %>%
p3 arrange(desc(length_mm)) %>%
mutate(id = factor(id, levels = id)) %>%
ggplot(aes(x = id, y = length_mm)) +
geom_col(fill = "darkorchid", alpha = 0.8) +
labs(title = "Individual Chromosome Sizes",
x = "Chromosome",
y = "Length (mm)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))
wrap_plots(p1, p2, p3, nrow = 1)
<- cor(chromosome$length_mm, chromosome$protein_codinggenes, use = "complete.obs")
cor_protein_genes <- cor(chromosome$length_mm, chromosome$mi_rna, use = "complete.obs")
cor_mirnas
<- ggplot(chromosome, aes(x = length_mm, y = protein_codinggenes)) +
p4 geom_point(size = 3, alpha = 0.7, color = "darkgreen") +
geom_smooth(method = "lm", se = TRUE, color = "red", linewidth = 1.2) +
labs(title = "Protein Coding Genes vs Chromosome Length",
x = "Chromosome Length (mm)",
y = "Number of Protein Coding Genes",
subtitle = paste("Correlation coefficient:", round(cor_protein_genes, 3))) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
plot.subtitle = element_text(hjust = 0.5, size = 12))
<- ggplot(chromosome, aes(x = length_mm, y = mi_rna)) +
p5 geom_point(size = 3, alpha = 0.7, color = "darkblue") +
geom_smooth(method = "lm", se = TRUE, color = "orange", linewidth = 1.2) +
labs(title = "miRNAs vs Chromosome Length",
x = "Chromosome Length (mm)",
y = "Number of miRNAs",
subtitle = paste("Correlation coefficient:", round(cor_mirnas, 3))) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
plot.subtitle = element_text(hjust = 0.5, size = 12))
wrap_plots(p4, p5, nrow = 1)
<- proteins %>%
proteins_summary summarise(
across(c(length, mass),
list(mean = ~mean(.x, na.rm = TRUE),
median = ~median(.x, na.rm = TRUE),
max = ~max(.x, na.rm = TRUE)),
.names = "{.col}_{.fn}")
%>%
) pivot_longer(
cols = everything(),
names_to = c("variable", "statistic"),
names_sep = "_",
values_to = "value"
%>%
) pivot_wider(
names_from = statistic,
values_from = value
%>%
) mutate(
variable = case_when(
== "length" ~ "Protein Length",
variable == "mass" ~ "Protein Mass"
variable
),across(c(mean, median, max), ~round(.x, 2))
)
proteins_summary
suppressMessages({library(scales)})
<- cor(proteins$length, proteins$mass, use = "complete.obs")
cor_length_mass
<- proteins %>%
p6 filter(!is.na(length), !is.na(mass)) %>%
filter(length < quantile(length, 0.99, na.rm = TRUE),
< quantile(mass, 0.99, na.rm = TRUE)) %>%
mass ggplot(aes(x = length, y = mass)) +
geom_hex(alpha = 0.8, bins = 30) +
geom_smooth(method = "lm", se = TRUE, color = "red", linewidth = 1.5) +
scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Count") +
labs(title = "Relationship Between Protein Length and Mass",
x = "Protein Length (amino acids)",
y = "Protein Mass (Da)",
subtitle = paste("Correlation coefficient:", round(cor_length_mass, 3))) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, size = 16, face = "bold", color = "darkblue"),
plot.subtitle = element_text(hjust = 0.5, size = 12, color = "gray40"),
axis.title = element_text(size = 12, face = "bold"),
legend.position = "right",
panel.grid.minor = element_blank(),
panel.background = element_rect(fill = "gray98", color = NA),
plot.background = element_rect(fill = "white", color = NA)
+
) scale_x_continuous(labels = comma_format()) +
scale_y_continuous(labels = comma_format())
<- proteins %>%
p7 filter(!is.na(length), !is.na(mass)) %>%
filter(length < quantile(length, 0.95, na.rm = TRUE),
< quantile(mass, 0.95, na.rm = TRUE)) %>%
mass ggplot(aes(x = length, y = mass)) +
geom_point(alpha = 0.3, size = 0.8, color = "steelblue") +
geom_density_2d_filled(alpha = 0.6, contour_var = "ndensity") +
geom_smooth(method = "lm", se = FALSE, color = "red", linewidth = 2) +
scale_fill_viridis_d(name = "Density", option = "plasma") +
labs(title = "Protein Length vs Mass Relationship",
x = "Protein Length (amino acids)",
y = "Protein Mass (Da)",
subtitle = "Density contours show data distribution patterns") +
theme_dark() +
theme(
plot.title = element_text(hjust = 0.5, size = 16, face = "bold", color = "white"),
plot.subtitle = element_text(hjust = 0.5, size = 11, color = "gray80"),
axis.title = element_text(size = 12, face = "bold", color = "white"),
axis.text = element_text(color = "white"),
legend.title = element_text(color = "white"),
legend.text = element_text(color = "white"),
panel.grid.major = element_line(color = "gray30", size = 0.3),
panel.grid.minor = element_blank(),
legend.background = element_rect(fill = "gray10", color = NA),
plot.background = element_rect(fill = "gray10", color = NA)
+
) scale_x_continuous(labels = comma_format()) +
scale_y_continuous(labels = comma_format())
wrap_plots(p6, p7, nrow = 1)