一、GDSC

GDSC(Genomics of Drug Sensitivity in Cancer):https://www.cancerrxgene.org/ ,数据已上传至阿里云盘

image-20221009090318241

1、原始数据整理

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
## Preprocessing (kept commented out; shown for reference only)
# Tidies the raw GDSC downloads: the drug table, the cell-line table and the
# two fitted dose-response workbooks (GDSC1 / GDSC2), which are then stacked
# into GDSC_merge.
# NOTE(review): no write step is visible in this block -- presumably the
# results were saved as the GDSC/*.csv files that are read further down; confirm.
# library(tidyverse)
# # RAW folder (presumably the working directory holding the raw files -- confirm)
# gdsc_drug = read.csv("GDSC_drug.csv")
# # Replace every dot in the column names with an underscore.
# colnames(gdsc_drug) = gsub("[.]", "_", colnames(gdsc_drug))
# 
# gdsc_cl = read.csv("GDSC_cellline.csv")
# colnames(gdsc_cl) = gsub("[.]", "_", colnames(gdsc_cl))
# # Reshape to one row per cell line, with one column per value of Datasets
# # holding number_of_drugs. "TCGA_Classfication" is the column name used by
# # this formula, so the spelling must be left as it is.
# gdsc_cl = gdsc_cl %>% 
#   reshape2::dcast(Cell_line_Name+Model_ID+COSMIC_ID+TCGA_Classfication+Tissue+Tissue_sub_type~Datasets,
#                   value.var = "number_of_drugs")
# 
# GDSC1 = readxl::read_excel("GDSC1_fitted_dose_response_25Feb20.xlsx")
# # Columns are dropped by position in two passes; the indices of the second
# # pass refer to the table that remains after the first pass.
# # NOTE(review): which columns these positions are depends on the workbook
# # layout, which is not visible here -- confirm before re-running.
# GDSC1 = GDSC1[,c(-4, -6)]
# GDSC1 = GDSC1[,c(-6, -8, -9)]
# # Move the eight listed columns to the front; everything() keeps the rest.
# GDSC1 = GDSC1 %>% 
#   dplyr::select(DATASET, DRUG_NAME, CELL_LINE_NAME, TCGA_DESC, LN_IC50, AUC, RMSE, Z_SCORE, everything())
# GDSC1 = GDSC1 %>% as.data.frame()
# head(GDSC1)
# 
# # Same steps for the GDSC2 workbook.
# GDSC2 = readxl::read_excel("GDSC2_fitted_dose_response_25Feb20.xlsx")
# GDSC2 = GDSC2[,c(-4, -6)]
# GDSC2 = GDSC2[,c(-6, -8, -9)]
# GDSC2 = GDSC2 %>% 
#   dplyr::select(DATASET, DRUG_NAME, CELL_LINE_NAME, TCGA_DESC, LN_IC50, AUC, RMSE, Z_SCORE, everything())
# GDSC2 = GDSC2 %>% as.data.frame()
# head(GDSC2)
# 
# # Stack both releases; rbind() requires the two data frames to have the
# # same columns.
# GDSC_merge = rbind(GDSC1, GDSC2)
# head(GDSC_merge)
# 
# head(gdsc_cl)

2、敏感度实验结果

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# Attach dplyr so that `%>%` and desc() are available when this snippet is
# run on its own (the only library() call elsewhere in the file is commented
# out).
library(dplyr)

# GDSC drug-response results table.
GDSC_res <- read.csv("GDSC/GDSC_result.csv")
#   DATASET DRUG_NAME CELL_LINE_NAME    TCGA_DESC  LN_IC50      AUC     RMSE   Z_SCORE
# 1   GDSC1 Erlotinib         MC-CAR           MM 2.395685 0.982114 0.022521 -0.189576
# 2   GDSC1 Erlotinib            ES3 UNCLASSIFIED 3.140923 0.984816 0.031840  0.508635
# 3   GDSC1 Erlotinib            ES5 UNCLASSIFIED 3.968757 0.985693 0.026052  1.284229
# 4   GDSC1 Erlotinib            ES7 UNCLASSIFIED 2.692768 0.972699 0.110056  0.088760
# 5   GDSC1 Erlotinib          EW-11 UNCLASSIFIED 2.478678 0.944462 0.087011 -0.111820
# 6   GDSC1 Erlotinib        SK-ES-1 UNCLASSIFIED 2.034050 0.950763 0.016288 -0.528390

## Total number of distinct drugs
GDSC_res %>%
  dplyr::distinct(DRUG_NAME) %>%
  dim()
# [1] 449   1

## Number of distinct drugs per dataset
GDSC_res %>%
  dplyr::distinct(DATASET, DRUG_NAME) %>%
  dplyr::count(DATASET, name = "Drugs")
#   DATASET Drugs
# 1   GDSC1   345
# 2   GDSC2   192

## Number of assays (rows) per cell line, one column per dataset
GDSC_res %>%
  dplyr::count(DATASET, CELL_LINE_NAME, name = "assays") %>%
  reshape2::dcast(CELL_LINE_NAME ~ DATASET, value.var = "assays") %>%
  dplyr::arrange(dplyr::desc(GDSC1)) %>%
  head()
#   CELL_LINE_NAME GDSC1 GDSC2
# 1           A253   367   179
# 2          AMO-1   367   178
# 3         KCL-22   367   178
# 4         KNS-42   367    NA

# Distribution of the two response measures and their correlation.
summary(GDSC_res$LN_IC50)
#     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
# -10.5793   0.8435   2.6228   2.2052   4.1216  12.3591
summary(GDSC_res$AUC)
#    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 0.00479 0.78839 0.92309 0.84467 0.97306 0.99984 
cor(GDSC_res$LN_IC50, GDSC_res$AUC)
# [1] 0.7534196

关于IC50与AUC:https://blog.csdn.net/linkequa/article/details/88221975

理论上IC50值或者AUC值越小,表明细胞系对于药物越敏感。

药物敏感性评价指标_JasonKQLin的博客-CSDN博客

3、药物与细胞系信息

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
# Drug annotation table; preview the first rows.
GDSC_drug <- read.csv(file = "GDSC/GDSC_drug.csv")
head(x = GDSC_drug)
#   Drug_Id              Name                    Synonyms                    Targets Target_pathway PubCHEM Datasets number_of_cell_lines Screening_site
# 1    1242 (5Z)-7-Oxozeaenol 5Z-7-Oxozeaenol, LL-Z1640-2                       TAK1 Other, kinases 9863776    GDSC1                  899         SANGER
# 2    1824            123138                                                          Unclassified            GDSC2                  717         SANGER
# 3    1820            123829                                                          Unclassified            GDSC2                  717         SANGER

# Cell-line annotation table; preview the first rows.
GDSC_cl <- read.csv(file = "GDSC/GDSC_cl.csv")
head(x = GDSC_cl)
#   Cell_line_Name  Model_ID COSMIC_ID TCGA_Classfication            Tissue Tissue_sub_type GDSC1 GDSC2
# 1          22RV1 SIDM00499    924100               PRAD urogenital_system        prostate   353   282
# 2       23132-87 SIDM00980    910924               STAD  digestive_system         stomach   344   281
# 3       42-MG-BA SIDM00982    687561                GBM    nervous_system          glioma   345   281

https://cellmodelpassports.sanger.ac.uk/downloads

在上面网址可下载肿瘤细胞系的多种组学数据,包括转录组、基因组等,有需要时再整理。

二、CTRP(Cancer Therapeutics Response Portal,下文记作 CTRL)

https://ocg.cancer.gov/programs/ctd2/data-portal/ ,已上传至阿里云盘

image-20221009114511170

1、原始数据处理

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# Raw-data processing for both releases (kept commented out; shown for
# reference only). Reads the downloaded text files and writes them back out
# as CSV files under CTRL/.
# ## ctrlv1
# # v1: the three tables are written out unchanged.
# ctrlv1_res = data.table::fread("CTRPv1.0_2013_pub_Cell_154_1151/v10.D3.area_under_conc_curve.txt")
# 
# ctrlv1_drug = data.table::fread("CTRPv1.0_2013_pub_Cell_154_1151/v10.M1.informer_set.txt")
# 
# ctrlv1_cl = data.table::fread("CTRPv1.0_2013_pub_Cell_154_1151/v10.M2.cell_line_info.txt")
# 
# write.csv(ctrlv1_res, file = "CTRL/ctrl_v1_res.csv", row.names = F)
# write.csv(ctrlv1_drug, file = "CTRL/ctrl_v1_drug.csv", row.names = F)
# write.csv(ctrlv1_cl, file = "CTRL/ctrl_v1_cl.csv", row.names = F)
# 
# ## ctrlv2
# ctrlv2_res = data.table::fread("CTRPv2.0_2015_ctd2_ExpandedDataset/v20.data.curves_post_qc.txt")
# # NOTE(review): `exp_id` is never defined in the visible code. It presumably
# # holds the per-experiment metadata (experiment_id -> master_ccl_id) from the
# # same download -- confirm and load it before re-running this step.
# # The joins in this block pass no `by`, so they match on whatever column
# # names the two tables share.
# # After the join, AUC and apparent EC50 are averaged over all rows that
# # share the same cell line / compound pair.
# # NOTE(review): mean() is called without na.rm, so a single NA in a group
# # yields NA for that pair -- confirm this is intended.
# ctrlv2_res = ctrlv2_res %>% 
#   dplyr::left_join(exp_id[,c("experiment_id","master_ccl_id")]) %>% 
#   dplyr::select(master_ccl_id, master_cpd_id, area_under_curve, apparent_ec50_umol) %>% 
#   dplyr::group_by(master_ccl_id, master_cpd_id) %>% 
#   dplyr::summarise(area_under_curve = mean(area_under_curve),
#                    apparent_ec50_umol = mean(apparent_ec50_umol)) %>% as.data.frame()
# 
# 
# ctrlv2_drug = data.table::fread("CTRPv2.0_2015_ctd2_ExpandedDataset/v20.meta.per_compound.txt")
# 
# ctrlv2_cl = data.table::fread("CTRPv2.0_2015_ctd2_ExpandedDataset/v20.meta.per_cell_line.txt")
# 
# # Swap the numeric ids for names: attach cpd_name and ccl_name, then keep
# # only the two names and the two response measures.
# ctrlv2_res2 = ctrlv2_res %>% 
#   dplyr::left_join(ctrlv2_drug[,c("master_cpd_id","cpd_name")]) %>% 
#   dplyr::left_join(ctrlv2_cl[,c("master_ccl_id","ccl_name")]) %>% 
#   dplyr::select(cpd_name, ccl_name, area_under_curve, apparent_ec50_umol)
# 
# write.csv(ctrlv2_res2, file = "CTRL/ctrl_v2_res.csv", row.names = F)
# write.csv(ctrlv2_drug, file = "CTRL/ctrl_v2_drug.csv", row.names = F)
# write.csv(ctrlv2_cl, file = "CTRL/ctrl_v2_cl.csv", row.names = F)

v10.D3.area_under_conc_curve.txt:AUCs < 3.5 are considered sensitive to compound treatment, AUCs > 5.5 are considered non-responsive to compound treatment

即AUC值越小,表示细胞系对药物越敏感

  • IC50 (half maximal inhibitory concentration):半数抑制浓度,即产生 50% 最大抑制效应时的药物浓度
  • EC50 (half maximal effective concentration):半数效应浓度,即产生 50% 最大效应时的药物浓度

2、CTRL v1

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
### Sensitivity assay results (v1)
ctrl1_res <- read.csv("CTRL/ctrl_v1_res.csv")
head(ctrl1_res)
#   ccl_name            cpd_name area_under_curve
# 1     U2OS          zebularine           6.0416
# 2     U2OS           maraviroc           6.8981
# 3     U2OS             DL-TBOA           6.8593

# Number of distinct compounds and cell lines, and the AUC distribution.
length(unique(ctrl1_res$cpd_name))
# [1] 354
length(unique(ctrl1_res$ccl_name))
# [1] 242
summary(ctrl1_res$area_under_curve)
#    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
# 0.04763  4.27830  5.81120  5.33269  6.72490 15.71900

### Drug / cell-line annotation (v1)
ctrl1_drug <- read.csv("CTRL/ctrl_v1_drug.csv")
# Transpose the first row so every field of one compound record is readable.
t(ctrl1_drug[1, ])
#                                 1
# cpd_name                       "(-)-gallocatechin-3-monogallate"
# cpd_synonym                    "gallocatechin gallate;L-GCG;GCG"
# cpd_status                     "probe"
# target_or_activity_of_compound "natural product"
# gene_symbol_of_protein_target  ""
# top_test_conc_umol             "296"
# percent_cpd_purity             NA
# cpd_smiles                     "Oc1cc(O)c2C[C@@H](OC(=O)c3cc(O)c(O)c(O)c3)[C@@H](Oc2c1)c4cc(O)c(O)c(O)c4"
# pubchem_cid                    "199472"
# broad_cpd_id                   "BRD-K19216856"
# master_cpd_id                  "411730"

# The v1 cell-line table gets its own name. It was previously assigned to
# `ctrl2_drug`, which names neither the release (v1) nor the content
# (cell lines).
ctrl1_cl <- read.csv("CTRL/ctrl_v1_cl.csv")
head(ctrl1_cl)
#   master_ccl_id ccl_name ccl_availability                  ccle_primary_site ccle_primary_hist                  ccle_hist_subtype_1
# 1             1      697      ccle;public haematopoietic_and_lymphoid_tissue lymphoid_neoplasm acute_lymphoblastic_B_cell_leukaemia
# 2             3     5637      ccle;public                      urinary_tract         carcinoma                                     
# 3             4  2313287      ccle;public                            stomach         carcinoma                       adenocarcinoma
# 4             5   1321N1             ccle             central_nervous_system            glioma                          astrocytoma

3、CTRL v2

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
### Sensitivity assay results (v2)
ctrl2_res <- read.csv("CTRL/ctrl_v2_res.csv")
head(ctrl2_res)
#              cpd_name ccl_name area_under_curve apparent_ec50_umol
# 1             BRD4132      697           11.128             10.060
# 2             BRD6340      697           12.328             18.610
# 3               ML006      697           12.305             32.730
# 4 Bax channel blocker      697           13.085              8.402

# Number of distinct compounds and cell lines, and the AUC distribution.
length(unique(ctrl2_res$cpd_name))
# [1] 545
length(unique(ctrl2_res$ccl_name))
# [1] 887
summary(ctrl2_res$area_under_curve)
#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 0.0691 11.6080 13.4410 12.7647 14.5820 29.3500 

### Drug / cell-line annotation (v2)
ctrl2_drug <- read.csv("CTRL/ctrl_v2_drug.csv")
# Transpose the first row so every field of one compound record is readable.
t(ctrl2_drug[1, ])
#                                 1
# master_cpd_id                  "1788"
# cpd_name                       "CIL55"
# broad_cpd_id                   "BRD-K46556387"
# top_test_conc_umol             "10"
# cpd_status                     "probe"
# inclusion_rationale            "pilot-set"
# gene_symbol_of_protein_target  ""
# target_or_activity_of_compound "screening hit"
# source_name                    "Columbia University"
# source_catalog_id              ""
# cpd_smiles                     "CN(C)CCNC(=O)c1cc2CSc3cc(Cl)ccc3-c2s1"

# The cell-line table gets its own name so that it no longer overwrites the
# compound table held in `ctrl2_drug`.
ctrl2_cl <- read.csv("CTRL/ctrl_v2_cl.csv")
head(ctrl2_cl)
#   master_ccl_id ccl_name ccl_availability                  ccle_primary_site ccle_primary_hist                  ccle_hist_subtype_1
# 1             1      697      ccle;public haematopoietic_and_lymphoid_tissue lymphoid_neoplasm acute_lymphoblastic_B_cell_leukaemia
# 2             3     5637      ccle;public                      urinary_tract         carcinoma                                     
# 3             4  2313287      ccle;public                            stomach         carcinoma                       adenocarcinoma
# 4             5   1321N1             ccle             central_nervous_system            glioma                          astrocytoma