
R version 3.4.0 (2017-04-21) -- "You Stupid Darkness"
Copyright (C) 2017 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu (64-bit)

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

> library(S4Vectors)
Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: 'BiocGenerics'

The following objects are masked from 'package:parallel':

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from 'package:stats':

    IQR, mad, sd, var, xtabs

The following objects are masked from 'package:base':

    Filter, Find, Map, Position, Reduce, anyDuplicated, append,
    as.data.frame, cbind, colMeans, colSums, colnames, do.call,
    duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,
    lapply, lengths, mapply, match, mget, order, paste, pmax, pmax.int,
    pmin, pmin.int, rank, rbind, rowMeans, rowSums, rownames, sapply,
    setdiff, sort, table, tapply, union, unique, unsplit, which,
    which.max, which.min


Attaching package: 'S4Vectors'

The following object is masked from 'package:base':

    expand.grid

> library(xml2)
> getdis = function(file="hmdb_metabolites.xml") {
+  rr = read_xml(file)
+  rrl = as_list(rr)
+  acc = unlist(sapply(rrl, "[[", "accession"))
+  nm = unlist(sapply(rrl, "[[", "name"))
+  dl = sapply(rrl, "[[", "diseases")
+  diss = sapply(dl, function(x) try(unlist(sapply(x, "[[", "name"))))
+  omim = sapply(dl, function(x) try(unlist(sapply(x, "[[", "omim_id"))))
+  diss = sapply(diss, function(x) {
+     if (inherits(x, "try-error")) x = NA
+     x 
+     })
+  omim = sapply(omim, function(x) {
+     if (inherits(x, "try-error")) x = NA
+     x 
+     })
+  ns = unlist(sapply(diss, length))
+  ons = unlist(sapply(omim, length))
+  accs = rep(acc, ns)
+  nms = rep(nm, ns)
+  oaccs = rep(acc, ons)
+  disframe = DataFrame(accession=accs, name=nms, disease=unlist(diss))
+  omimframe = DataFrame(accession=oaccs, omim=unlist(omim))
+  plist = function (pp, prop="uniprot_id") {
+     #g = sapply(pp, "[[", "gene_name")
+     ps = try(unlist(sapply(pp[[2]], "[[", prop)))
+     if (inherits(ps, "try-error")) {
+           df = DataFrame(accession=pp[[1]], val=NA)
+           names(df)[2] = prop
+           return(df)
+           }
+     n = length(ps)
+     df = DataFrame(accession = rep(pp[[1]], n), val=ps)
+     names(df)[2] = prop
+     df
+   }
+  ppl = lapply(rrl, function(x) list(x$accession[[1]], x$protein))
+  aa = lapply(ppl, plist)
+  gg = lapply(ppl, plist, prop="gene_name")
+  list(disframe=disframe, omimframe=omimframe,
+        prots=do.call(rbind, aa), genes=do.call(rbind, gg))
+ #
+ }
> mm = getdis()
Error in FUN(X[[i]], ...) : subscript out of bounds
...
Error in FUN(X[[i]], ...) : subscript out of bounds
Error in FUN(X[[i]], ...) : subscript out of bounds
Error in FUN(X[[i]], ...) : subscript out of bounds
Error in FUN(X[[i]], ...) : subscript out of bounds
> bar_disease = mm[[1]]
> save(bar_disease, file="bar_disease.rda")
> bar_omim = mm[[2]]
> save(bar_omim, file="bar_omim.rda")
> bar_protein = mm[[3]]
> save(bar_protein, file="bar_protein.rda")
> bar_gene = mm[[4]]
> save(bar_gene, file="bar_gene.rda")
> 
