Bio3D in R

 

SF

Page history last edited by barry 2 yrs ago

Functions to fix


 

read.pdb

Return addational REMARK data from PDB file (e.g. Resolution etc.)

"read.pdb" <- function (file, maxlines=50000, multi=FALSE,
rm.insert=FALSE, rm.alt=TRUE, verbose=TRUE) {
if(missing(file)) {
stop("read.pdb: please specify a PDB 'file' for reading")
}
if(!is.numeric(maxlines)) {
stop("read.pdb: 'maxlines' must be numeric")
}
if(!is.logical(multi)) {
stop("read.pdb: 'multi' must be logical TRUE/FALSE")
}
# PDB FORMAT v2.0:    colpos,  datatype,    name,      description
atom.format <- matrix(c(-6,     NA,          NA,       # (ATOM)
5,     'numeric',   "eleno",   # atom_no
-1,     NA,          NA,        # (blank)
4,     'character', "elety",   # atom_ty
1,     'character', "alt",     # alt_loc
4,     'character', "resid",   # res_na 
1,     'character', "chain",   # chain_id 
4,     'numeric',   "resno",   # res_no
1,     'character', "insert",  # ins_code
-3,     NA,           NA,       # (blank)
8,     'numeric',   "x",       # x
8,     'numeric',   "y",       # y
8,     'numeric',   "z",       # z
6,     'numeric',   "o",       # o
6,     'numeric',   "b",       # b
-6,     NA,           NA,       # (blank)
4,     'character', "segid"    # seg_id
), ncol=3, byrow=TRUE,
dimnames = list(c(1:17), c("widths","what","name")) )
split.string <- function(x) {
# split a string 'x'
x <- substring(x, first, last)
x[nchar(x) == 0] <- as.character(NA)
x
}
is.character0 <- function(x){length(x)==0 & is.character(x)}
trim <- function (s) {
# Remove leading and traling
# spaces from character strings
s <- sub("^ +", "", s)
s <- sub(" +$", "", s)
s[(s=="")]<-NA
s
}
# finds first and last (substr positions)
widths <-  as.numeric(atom.format[,"widths"]) # fixed-width spec  
drop.ind <- (widths < 0) # cols to ignore (i.e. -ve)
widths <- abs(widths)    # absolute vales for later  
st <- c(1, 1 + cumsum( widths ))
first <- st[-length(st)][!drop.ind] # substr start
last <- cumsum( widths )[!drop.ind] # substr end
# read n lines of PDB file
raw.lines  <- readLines(file, n = maxlines)
type <- substring(raw.lines,1,6)
# check number of END/ENDMDL records
raw.end <- sort(c(which(type == "END"),
which(type == "ENDMDL")))
if (length(raw.end) > 1) {
print("PDB has multiple END/ENDMDL records")
if (!multi) {
print("multi=FALSE: taking first record only")
raw.lines <- raw.lines[ (1:raw.end[1]) ]
type <- type[ (1:raw.end[1]) ]
} else {
print("multi=TRUE: 'read.dcd' will be quicker!")
}
}
if ( length(raw.end) !=1 ) {
if (length(raw.lines) == maxlines) {
# have not yet read all the file
print("You may need to increase 'maxlines'")
print("check you have all data in $atom")
}
}
# split by record type
raw.header <- raw.lines[type == "HEADER"]
raw.seqres <- raw.lines[type == "SEQRES"]
raw.helix  <- raw.lines[type == "HELIX "]
raw.sheet  <- raw.lines[type == "SHEET "]
raw.atom   <- raw.lines[type == "ATOM  "]
het.atom   <- raw.lines[type == "HETATM"]
# also look for "TER" records
## Resolution
re <- grep("2 RESOLUTION", raw.lines[type == "REMARK"], value=TRUE)
re <- as.numeric(gsub("ANGSTROMS.","",gsub("REMARK   2 RESOLUTION. ","",re)))
rm(raw.lines)
if (verbose) {
if (!is.character0(raw.header)) { cat(" ", raw.header, "n") }
}
seqres <- unlist(strsplit( trim(substring(raw.seqres,19,80))," "))
helix  <- list(start = as.numeric(substring(raw.helix,22,25)),
end   = as.numeric(substring(raw.helix,34,37)),
chain = trim(substring(raw.helix,20,20)),
type  = trim(substring(raw.helix,39,40)))
sheet  <- list(start = as.numeric(substring(raw.sheet,23,26)),
end   = as.numeric(substring(raw.sheet,34,37)),
chain = trim(substring(raw.sheet,22,22)),
sense = trim(substring(raw.sheet,39,40)))
# format ATOM records as a character matrix
atom <- matrix(trim(sapply(raw.atom, split.string)), byrow=TRUE,
ncol=nrow(atom.format[ !drop.ind,]), 
dimnames = list(NULL, atom.format[ !drop.ind,"name"]) )
# Alt records with m[,"alt"] != NA
if (rm.alt) {
if ( sum( !is.na(atom[,"alt"]) ) > 0 ) {
cat("   PDB has ALT records, taking A only, rm.alt=TRUEn")
alt.inds <- which( (atom[,"alt"] != "A") ) # take first alt only
if(length(alt.inds)>0)
atom <- atom[-alt.inds,]
}
}
# Insert records with m[,"insert"] != NA
if (rm.insert) {
if ( sum( !is.na(atom[,"insert"]) ) > 0 ) {
cat("   PDB has INSERT records, removing, rm.insert=TRUEn")
insert.inds <- which(!is.na(atom[,"insert"])) # rm insert positions
atom <- atom[-insert.inds,]
}
}
het <- matrix(trim(sapply(het.atom, split.string)), byrow=TRUE,
ncol=nrow(atom.format[ !drop.ind,]), 
dimnames = list(NULL, atom.format[ !drop.ind,"name"]) )
output<-list(atom=atom,
het=het,
helix=helix,
sheet=sheet,
seqres=seqres,
xyz=as.numeric(t(atom[,c("x","y","z")])),
calpha = as.logical(atom[,"elety"]=="CA"),
res=re)
class(output) <- "pdb"
return(output)
}

 

 

 

plot.pca.loadings

Plot normalized residue wise loadings not individual x, y and z values.

e.g.

 

# utility function to coerce an 'xyz' vector to an 'atom' wise 3 columns matrix

as.vec3d <- function(x,byrow=T) { matrix(x,ncol=3,byrow=byrow) }

# utility function for the norm ( magnitude ) of a vector

vec.norm <- function(x) { sqrt(sum(x^2)) }

#therefore if you have a pca result called 'pc.xray' you could get the atom loadings

plot( apply( as.vec3d(pc.xray$U[,1]),1,vec.norm ) )

 

core.find

Add a core.extract function wich takes as input the output object from core.find and either a length or volume cutoff value and returns the atom and xyz indices of core positions.

 

plot.dmat

Add code to plot.dmat (or mono.colors and bwr.colors) to insure color scale is centered at 0.

 

orient.pdb

  • Should return an xyz vector (not an Nx3 matrix) DONE

 

  • when file=TRUE, pdb writing is broken: writes out pdb files with 1 atom and N frames, atoms are all Nitrogen and the same resno

 

a123()

should "." also be accepted as a gap character? DONE

 

  • Barry Q. Currently gap characters "-" and "." get mapped to "---". Is this sensible? I supose "GAP" is a possible residue type that I should not use as it may come up in PDB files as a HET group or something.
  • Ana A. I think '---' is good for the 3 letter gaps. I just suggested accepting '.' as a single character gap, because some programs use it (eg, CGC used dots for gaps at the starts or ends of sequences and dashes for 'real' gaps). For the three letter, dashes are definitely better.
  • Barry. A. Agreed and I have added the "." to "---" code. It was not there before as I automaticaly convert dots in input fasta files to dashes. I did this for consastancy and to avoid the risk of the other meaning of dots when matching strings.

 

 

gray.colors() & bwr.colors()

Both functions are not producing the right amount of colors, with either one over or one under the requested number being output FIXED

gray.colors name conflicts with other function - could use bgw.colors or mono.colors instead PICKED mono.colors, also changed the plot.dmat examples

 

identiy()

When called to compare 2 sequences, the identity value of seq1 vs seq1 doesn't come out as 1 (for loop) FIXED

When two sequences have no sequence identity the function returns NA. It would be better if it returned 0 sequence identity instead.

Then the resulting identity matrix can be used with id.filter (currently, if any two sequences in a multiple sequence alignment have no sequence identity, the ide.filter function crashes - because hclust doesn't like NAs).

 

atom.select()

Cannot take vectors as part of the selection string N.B. FIX THIS SOON

 

rmsd()

Should be able to take just one argument (ala identity() function) Need to Test this

seems fine.

 

consensus()

In the example, when the really cute plot gets drawn, maybe an extra line would be nice to separate amino acids from the gap and unknown rows (i.e. add 2.5 to the following list)

Comments (0)

You don't have permission to comment on this page.