| |
SF
Page history last edited by barry 2 yrs ago
Functions to fix
read.pdb
Return addational REMARK data from PDB file (e.g. Resolution etc.)
"read.pdb" <- function (file, maxlines=50000, multi=FALSE,
rm.insert=FALSE, rm.alt=TRUE, verbose=TRUE) {
if(missing(file)) {
stop("read.pdb: please specify a PDB 'file' for reading")
}
if(!is.numeric(maxlines)) {
stop("read.pdb: 'maxlines' must be numeric")
}
if(!is.logical(multi)) {
stop("read.pdb: 'multi' must be logical TRUE/FALSE")
}
# PDB FORMAT v2.0: colpos, datatype, name, description
atom.format <- matrix(c(-6, NA, NA, # (ATOM)
5, 'numeric', "eleno", # atom_no
-1, NA, NA, # (blank)
4, 'character', "elety", # atom_ty
1, 'character', "alt", # alt_loc
4, 'character', "resid", # res_na
1, 'character', "chain", # chain_id
4, 'numeric', "resno", # res_no
1, 'character', "insert", # ins_code
-3, NA, NA, # (blank)
8, 'numeric', "x", # x
8, 'numeric', "y", # y
8, 'numeric', "z", # z
6, 'numeric', "o", # o
6, 'numeric', "b", # b
-6, NA, NA, # (blank)
4, 'character', "segid" # seg_id
), ncol=3, byrow=TRUE,
dimnames = list(c(1:17), c("widths","what","name")) )
split.string <- function(x) {
# split a string 'x'
x <- substring(x, first, last)
x[nchar(x) == 0] <- as.character(NA)
x
}
is.character0 <- function(x){length(x)==0 & is.character(x)}
trim <- function (s) {
# Remove leading and traling
# spaces from character strings
s <- sub("^ +", "", s)
s <- sub(" +$", "", s)
s[(s=="")]<-NA
s
}
# finds first and last (substr positions)
widths <- as.numeric(atom.format[,"widths"]) # fixed-width spec
drop.ind <- (widths < 0) # cols to ignore (i.e. -ve)
widths <- abs(widths) # absolute vales for later
st <- c(1, 1 + cumsum( widths ))
first <- st[-length(st)][!drop.ind] # substr start
last <- cumsum( widths )[!drop.ind] # substr end
# read n lines of PDB file
raw.lines <- readLines(file, n = maxlines)
type <- substring(raw.lines,1,6)
# check number of END/ENDMDL records
raw.end <- sort(c(which(type == "END"),
which(type == "ENDMDL")))
if (length(raw.end) > 1) {
print("PDB has multiple END/ENDMDL records")
if (!multi) {
print("multi=FALSE: taking first record only")
raw.lines <- raw.lines[ (1:raw.end[1]) ]
type <- type[ (1:raw.end[1]) ]
} else {
print("multi=TRUE: 'read.dcd' will be quicker!")
}
}
if ( length(raw.end) !=1 ) {
if (length(raw.lines) == maxlines) {
# have not yet read all the file
print("You may need to increase 'maxlines'")
print("check you have all data in $atom")
}
}
# split by record type
raw.header <- raw.lines[type == "HEADER"]
raw.seqres <- raw.lines[type == "SEQRES"]
raw.helix <- raw.lines[type == "HELIX "]
raw.sheet <- raw.lines[type == "SHEET "]
raw.atom <- raw.lines[type == "ATOM "]
het.atom <- raw.lines[type == "HETATM"]
# also look for "TER" records
## Resolution
re <- grep("2 RESOLUTION", raw.lines[type == "REMARK"], value=TRUE)
re <- as.numeric(gsub("ANGSTROMS.","",gsub("REMARK 2 RESOLUTION. ","",re)))
rm(raw.lines)
if (verbose) {
if (!is.character0(raw.header)) { cat(" ", raw.header, "n") }
}
seqres <- unlist(strsplit( trim(substring(raw.seqres,19,80))," "))
helix <- list(start = as.numeric(substring(raw.helix,22,25)),
end = as.numeric(substring(raw.helix,34,37)),
chain = trim(substring(raw.helix,20,20)),
type = trim(substring(raw.helix,39,40)))
sheet <- list(start = as.numeric(substring(raw.sheet,23,26)),
end = as.numeric(substring(raw.sheet,34,37)),
chain = trim(substring(raw.sheet,22,22)),
sense = trim(substring(raw.sheet,39,40)))
# format ATOM records as a character matrix
atom <- matrix(trim(sapply(raw.atom, split.string)), byrow=TRUE,
ncol=nrow(atom.format[ !drop.ind,]),
dimnames = list(NULL, atom.format[ !drop.ind,"name"]) )
# Alt records with m[,"alt"] != NA
if (rm.alt) {
if ( sum( !is.na(atom[,"alt"]) ) > 0 ) {
cat(" PDB has ALT records, taking A only, rm.alt=TRUEn")
alt.inds <- which( (atom[,"alt"] != "A") ) # take first alt only
if(length(alt.inds)>0)
atom <- atom[-alt.inds,]
}
}
# Insert records with m[,"insert"] != NA
if (rm.insert) {
if ( sum( !is.na(atom[,"insert"]) ) > 0 ) {
cat(" PDB has INSERT records, removing, rm.insert=TRUEn")
insert.inds <- which(!is.na(atom[,"insert"])) # rm insert positions
atom <- atom[-insert.inds,]
}
}
het <- matrix(trim(sapply(het.atom, split.string)), byrow=TRUE,
ncol=nrow(atom.format[ !drop.ind,]),
dimnames = list(NULL, atom.format[ !drop.ind,"name"]) )
output<-list(atom=atom,
het=het,
helix=helix,
sheet=sheet,
seqres=seqres,
xyz=as.numeric(t(atom[,c("x","y","z")])),
calpha = as.logical(atom[,"elety"]=="CA"),
res=re)
class(output) <- "pdb"
return(output)
}
plot.pca.loadings
Plot normalized residue wise loadings not individual x, y and z values.
e.g.
# utility function to coerce an 'xyz' vector to an 'atom' wise 3 columns matrix as.vec3d <- function(x,byrow=T) { matrix(x,ncol=3,byrow=byrow) } # utility function for the norm ( magnitude ) of a vector vec.norm <- function(x) { sqrt(sum(x^2)) } #therefore if you have a pca result called 'pc.xray' you could get the atom loadings plot( apply( as.vec3d(pc.xray$U[,1]),1,vec.norm ) )
core.find
Add a core.extract function wich takes as input the output object from core.find and either a length or volume cutoff value and returns the atom and xyz indices of core positions.
plot.dmat
Add code to plot.dmat (or mono.colors and bwr.colors) to insure color scale is centered at 0.
orient.pdb
Should return an xyz vector (not an Nx3 matrix) DONE
when file=TRUE, pdb writing is broken: writes out pdb files with 1 atom and N frames, atoms are all Nitrogen and the same resno
a123()
should "." also be accepted as a gap character? DONE
- Barry Q. Currently gap characters "-" and "." get mapped to "---". Is this sensible? I supose "GAP" is a possible residue type that I should not use as it may come up in PDB files as a HET group or something.
- Ana A. I think '---' is good for the 3 letter gaps. I just suggested accepting '.' as a single character gap, because some programs use it (eg, CGC used dots for gaps at the starts or ends of sequences and dashes for 'real' gaps). For the three letter, dashes are definitely better.
- Barry. A. Agreed and I have added the "." to "---" code. It was not there before as I automaticaly convert dots in input fasta files to dashes. I did this for consastancy and to avoid the risk of the other meaning of dots when matching strings.
gray.colors() & bwr.colors()
Both functions are not producing the right amount of colors, with either one over or one under the requested number being output FIXED
gray.colors name conflicts with other function - could use bgw.colors or mono.colors instead PICKED mono.colors, also changed the plot.dmat examples
identiy()
When called to compare 2 sequences, the identity value of seq1 vs seq1 doesn't come out as 1 (for loop) FIXED
When two sequences have no sequence identity the function returns NA. It would be better if it returned 0 sequence identity instead.
Then the resulting identity matrix can be used with id.filter (currently, if any two sequences in a multiple sequence alignment have no sequence identity, the ide.filter function crashes - because hclust doesn't like NAs).
atom.select()
Cannot take vectors as part of the selection string N.B. FIX THIS SOON
rmsd()
Should be able to take just one argument (ala identity() function) Need to Test this
seems fine.
consensus()
In the example, when the really cute plot gets drawn, maybe an extra line would be nice to separate amino acids from the gap and unknown rows (i.e. add 2.5 to the following list)
SF
|
|
Tip: To turn text into a link, highlight the text, then click on a page or file from the list above.
|
|
|
|
|
Comments (0)
You don't have permission to comment on this page.